# 1 Exploratory Data Analysis

## ============================= STANDARD EDA TECHNIQUES ==============================

## <<<< READING IN DATA >>>>
## ===== FULL DATASET =====
## bill.data.test <- read.csv('Bills.subset.test.csv', header=TRUE, sep=',',
##   na.strings='')  # accounts for header, CSV, and na strings
# Load the full dataset from diabetic.data.csv. header/sep are spelled out to
# match the file layout; only empty fields are mapped to NA.
# stringsAsFactors is pinned because the R >= 4.0.0 default (FALSE) would read
# text columns as character, and summary(df.full$readmitted) would then print
# Length/Class/Mode instead of the per-level counts recorded further down.
# NOTE(review): the printed rows show "?" in weight, race, payer_code, ... --
# presumably this dataset's missing-value marker. na.strings = "" keeps those
# as literal values; confirm that later cleaning handles them.
df.full <- read.csv(
  "diabetic.data.csv",
  header = TRUE,
  sep = ",",
  na.strings = "",
  stringsAsFactors = TRUE
)
dim(df.full)  # 101,766 observations x 50 variables
## [1] 101766     50
head(df.full, n = 30)
##    encounter_id patient_nbr            race gender      age weight
## 1       2278392     8222157       Caucasian Female   [0-10)      ?
## 2        149190    55629189       Caucasian Female  [10-20)      ?
## 3         64410    86047875 AfricanAmerican Female  [20-30)      ?
## 4        500364    82442376       Caucasian   Male  [30-40)      ?
## 5         16680    42519267       Caucasian   Male  [40-50)      ?
## 6         35754    82637451       Caucasian   Male  [50-60)      ?
## 7         55842    84259809       Caucasian   Male  [60-70)      ?
## 8         63768   114882984       Caucasian   Male  [70-80)      ?
## 9         12522    48330783       Caucasian Female  [80-90)      ?
## 10        15738    63555939       Caucasian Female [90-100)      ?
## 11        28236    89869032 AfricanAmerican Female  [40-50)      ?
## 12        36900    77391171 AfricanAmerican   Male  [60-70)      ?
## 13        40926    85504905       Caucasian Female  [40-50)      ?
## 14        42570    77586282       Caucasian   Male  [80-90)      ?
## 15        62256    49726791 AfricanAmerican Female  [60-70)      ?
## 16        73578    86328819 AfricanAmerican   Male  [60-70)      ?
## 17        77076    92519352 AfricanAmerican   Male  [50-60)      ?
## 18        84222   108662661       Caucasian Female  [50-60)      ?
## 19        89682   107389323 AfricanAmerican   Male  [70-80)      ?
## 20       148530    69422211               ?   Male  [70-80)      ?
## 21       150006    22864131               ? Female  [50-60)      ?
## 22       150048    21239181               ?   Male  [60-70)      ?
## 23       182796    63000108 AfricanAmerican Female  [70-80)      ?
## 24       183930   107400762       Caucasian Female  [80-90)      ?
## 25       216156    62718876 AfricanAmerican Female  [70-80)      ?
## 26       221634    21861756           Other Female  [50-60)      ?
## 27       236316    40523301       Caucasian   Male  [80-90)      ?
## 28       248916   115196778       Caucasian Female  [50-60)      ?
## 29       250872    41606064       Caucasian   Male  [20-30)      ?
## 30       252822    18196434       Caucasian Female  [80-90)      ?
##    admission_type_id discharge_disposition_id admission_source_id
## 1                  6                       25                   1
## 2                  1                        1                   7
## 3                  1                        1                   7
## 4                  1                        1                   7
## 5                  1                        1                   7
## 6                  2                        1                   2
## 7                  3                        1                   2
## 8                  1                        1                   7
## 9                  2                        1                   4
## 10                 3                        3                   4
## 11                 1                        1                   7
## 12                 2                        1                   4
## 13                 1                        3                   7
## 14                 1                        6                   7
## 15                 3                        1                   2
## 16                 1                        3                   7
## 17                 1                        1                   7
## 18                 1                        1                   7
## 19                 1                        1                   7
## 20                 3                        6                   2
## 21                 2                        1                   4
## 22                 2                        1                   4
## 23                 2                        1                   4
## 24                 2                        6                   1
## 25                 3                        1                   2
## 26                 1                        1                   7
## 27                 1                        3                   7
## 28                 1                        1                   1
## 29                 2                        1                   2
## 30                 1                        2                   7
##    time_in_hospital payer_code        medical_specialty num_lab_procedures
## 1                 1          ? Pediatrics-Endocrinology                 41
## 2                 3          ?                        ?                 59
## 3                 2          ?                        ?                 11
## 4                 2          ?                        ?                 44
## 5                 1          ?                        ?                 51
## 6                 3          ?                        ?                 31
## 7                 4          ?                        ?                 70
## 8                 5          ?                        ?                 73
## 9                13          ?                        ?                 68
## 10               12          ?         InternalMedicine                 33
## 11                9          ?                        ?                 47
## 12                7          ?                        ?                 62
## 13                7          ?   Family/GeneralPractice                 60
## 14               10          ?   Family/GeneralPractice                 55
## 15                1          ?                        ?                 49
## 16               12          ?                        ?                 75
## 17                4          ?                        ?                 45
## 18                3          ?               Cardiology                 29
## 19                5          ?                        ?                 35
## 20                6          ?                        ?                 42
## 21                2          ?                        ?                 66
## 22                2          ?                        ?                 36
## 23                2          ?                        ?                 47
## 24               11          ?                        ?                 42
## 25                3          ?                        ?                 19
## 26                1          ?                        ?                 33
## 27                6          ?               Cardiology                 64
## 28                2          ?          Surgery-General                 25
## 29               10          ?                        ?                 53
## 30                5          ?               Cardiology                 52
##    num_procedures num_medications number_outpatient number_emergency
## 1               0               1                 0                0
## 2               0              18                 0                0
## 3               5              13                 2                0
## 4               1              16                 0                0
## 5               0               8                 0                0
## 6               6              16                 0                0
## 7               1              21                 0                0
## 8               0              12                 0                0
## 9               2              28                 0                0
## 10              3              18                 0                0
## 11              2              17                 0                0
## 12              0              11                 0                0
## 13              0              15                 0                1
## 14              1              31                 0                0
## 15              5               2                 0                0
## 16              5              13                 0                0
## 17              4              17                 0                0
## 18              0              11                 0                0
## 19              5              23                 0                0
## 20              2              23                 0                0
## 21              1              19                 0                0
## 22              2              11                 0                0
## 23              0              12                 0                0
## 24              2              19                 0                0
## 25              4              18                 0                0
## 26              0               7                 0                0
## 27              3              18                 0                0
## 28              2              11                 0                0
## 29              0              20                 0                0
## 30              0              14                 0                0
##    number_inpatient diag_1 diag_2 diag_3 number_diagnoses max_glu_serum
## 1                 0 250.83      ?      ?                1          None
## 2                 0    276 250.01    255                9          None
## 3                 1    648    250    V27                6          None
## 4                 0      8 250.43    403                7          None
## 5                 0    197    157    250                5          None
## 6                 0    414    411    250                9          None
## 7                 0    414    411    V45                7          None
## 8                 0    428    492    250                8          None
## 9                 0    398    427     38                8          None
## 10                0    434    198    486                8          None
## 11                0  250.7    403    996                9          None
## 12                0    157    288    197                7          None
## 13                0    428 250.43  250.6                8          None
## 14                0    428    411    427                8          None
## 15                0    518    998    627                8          None
## 16                0    999    507    996                9          None
## 17                0    410    411    414                8          None
## 18                0    682    174    250                3          None
## 19                0    402    425    416                9          None
## 20                0    737    427    714                8          None
## 21                0    410    427    428                7          None
## 22                0    572    456    427                6          None
## 23                0    410    401    582                8          None
## 24                0    V57    715    V43                8          None
## 25                0    189    496    427                6          None
## 26                0    786    401    250                3          None
## 27                0    427    428    414                7          None
## 28                0    996    585 250.01                3          None
## 29                0    277 250.02    263                6          None
## 30                0    428    410    414                8          None
##    A1Cresult metformin repaglinide nateglinide chlorpropamide glimepiride
## 1       None        No          No          No             No          No
## 2       None        No          No          No             No          No
## 3       None        No          No          No             No          No
## 4       None        No          No          No             No          No
## 5       None        No          No          No             No          No
## 6       None        No          No          No             No          No
## 7       None    Steady          No          No             No      Steady
## 8       None        No          No          No             No          No
## 9       None        No          No          No             No          No
## 10      None        No          No          No             No          No
## 11      None        No          No          No             No          No
## 12      None        No          No          No             No          No
## 13      None    Steady          Up          No             No          No
## 14      None        No          No          No             No          No
## 15      None        No          No          No             No          No
## 16      None        No          No          No             No          No
## 17      None        No          No          No             No          No
## 18      None        No          No          No             No          No
## 19      None        No          No          No             No          No
## 20      None        No          No          No             No          No
## 21      None        No          No          No             No          No
## 22      None    Steady          No          No             No      Steady
## 23      None        No          No          No             No          No
## 24      None        No          No          No             No          No
## 25      None        No          No          No             No          No
## 26      None    Steady          No          No             No          No
## 27        >7    Steady          No          No             No          No
## 28      None        No          No          No             No          No
## 29      None        No          No          No             No          No
## 30      None    Steady          No          No             No          No
##    acetohexamide glipizide glyburide tolbutamide pioglitazone
## 1             No        No        No          No           No
## 2             No        No        No          No           No
## 3             No    Steady        No          No           No
## 4             No        No        No          No           No
## 5             No    Steady        No          No           No
## 6             No        No        No          No           No
## 7             No        No        No          No           No
## 8             No        No    Steady          No           No
## 9             No    Steady        No          No           No
## 10            No        No        No          No           No
## 11            No        No        No          No           No
## 12            No        No        Up          No           No
## 13            No        No        No          No           No
## 14            No        No        No          No           No
## 15            No        No        No          No           No
## 16            No        No        No          No           No
## 17            No    Steady        No          No           No
## 18            No        No    Steady          No           No
## 19            No        No        No          No           No
## 20            No        No      Down          No           No
## 21            No        No        No          No           No
## 22            No        No        No          No           No
## 23            No        No        No          No           No
## 24            No        No        No          No           No
## 25            No    Steady        No          No           No
## 26            No        No        No          No           No
## 27            No        No    Steady          No           No
## 28            No        No        No          No           No
## 29            No        No        No          No           No
## 30            No        No    Steady          No           No
##    rosiglitazone acarbose miglitol troglitazone tolazamide examide
## 1             No       No       No           No         No      No
## 2             No       No       No           No         No      No
## 3             No       No       No           No         No      No
## 4             No       No       No           No         No      No
## 5             No       No       No           No         No      No
## 6             No       No       No           No         No      No
## 7             No       No       No           No         No      No
## 8             No       No       No           No         No      No
## 9             No       No       No           No         No      No
## 10        Steady       No       No           No         No      No
## 11            No       No       No           No         No      No
## 12            No       No       No           No         No      No
## 13            No       No       No           No         No      No
## 14            No       No       No           No         No      No
## 15            No       No       No           No         No      No
## 16            No       No       No           No         No      No
## 17            No       No       No           No         No      No
## 18            No       No       No           No         No      No
## 19            No       No       No           No         No      No
## 20            No       No       No           No         No      No
## 21            No       No       No           No         No      No
## 22            No       No       No           No         No      No
## 23            No       No       No           No         No      No
## 24            No       No       No           No         No      No
## 25            No       No       No           No         No      No
## 26            No       No       No           No         No      No
## 27            No       No       No           No         No      No
## 28            No       No       No           No         No      No
## 29            No       No       No           No         No      No
## 30            No       No       No           No         No      No
##    citoglipton insulin glyburide.metformin glipizide.metformin
## 1           No      No                  No                  No
## 2           No      Up                  No                  No
## 3           No      No                  No                  No
## 4           No      Up                  No                  No
## 5           No  Steady                  No                  No
## 6           No  Steady                  No                  No
## 7           No  Steady                  No                  No
## 8           No      No                  No                  No
## 9           No  Steady                  No                  No
## 10          No  Steady                  No                  No
## 11          No  Steady                  No                  No
## 12          No  Steady                  No                  No
## 13          No    Down                  No                  No
## 14          No  Steady                  No                  No
## 15          No  Steady                  No                  No
## 16          No      Up                  No                  No
## 17          No  Steady                  No                  No
## 18          No      No                  No                  No
## 19          No  Steady                  No                  No
## 20          No  Steady                  No                  No
## 21          No    Down                  No                  No
## 22          No  Steady                  No                  No
## 23          No      No                  No                  No
## 24          No      No                  No                  No
## 25          No  Steady                  No                  No
## 26          No      No                  No                  No
## 27          No      No                  No                  No
## 28          No  Steady                  No                  No
## 29          No    Down                  No                  No
## 30          No      No                  No                  No
##    glimepiride.pioglitazone metformin.rosiglitazone metformin.pioglitazone
## 1                        No                      No                     No
## 2                        No                      No                     No
## 3                        No                      No                     No
## 4                        No                      No                     No
## 5                        No                      No                     No
## 6                        No                      No                     No
## 7                        No                      No                     No
## 8                        No                      No                     No
## 9                        No                      No                     No
## 10                       No                      No                     No
## 11                       No                      No                     No
## 12                       No                      No                     No
## 13                       No                      No                     No
## 14                       No                      No                     No
## 15                       No                      No                     No
## 16                       No                      No                     No
## 17                       No                      No                     No
## 18                       No                      No                     No
## 19                       No                      No                     No
## 20                       No                      No                     No
## 21                       No                      No                     No
## 22                       No                      No                     No
## 23                       No                      No                     No
## 24                       No                      No                     No
## 25                       No                      No                     No
## 26                       No                      No                     No
## 27                       No                      No                     No
## 28                       No                      No                     No
## 29                       No                      No                     No
## 30                       No                      No                     No
##    change diabetesMed readmitted
## 1      No          No         NO
## 2      Ch         Yes        >30
## 3      No         Yes         NO
## 4      Ch         Yes         NO
## 5      Ch         Yes         NO
## 6      No         Yes        >30
## 7      Ch         Yes         NO
## 8      No         Yes        >30
## 9      Ch         Yes         NO
## 10     Ch         Yes         NO
## 11     No         Yes        >30
## 12     Ch         Yes        <30
## 13     Ch         Yes        <30
## 14     No         Yes         NO
## 15     No         Yes        >30
## 16     Ch         Yes         NO
## 17     Ch         Yes        <30
## 18     No         Yes         NO
## 19     No         Yes        >30
## 20     Ch         Yes         NO
## 21     Ch         Yes         NO
## 22     Ch         Yes         NO
## 23     No          No         NO
## 24     No          No        >30
## 25     Ch         Yes         NO
## 26     No         Yes         NO
## 27     Ch         Yes         NO
## 28     No         Yes        >30
## 29     Ch         Yes        >30
## 30     Ch         Yes        >30
# View(df.full)
# summary(df.full)
# Summarise the readmitted column; `[[` extracts it by exact name (no partial
# matching), and the recorded output below shows one count per level.
summary(df.full[["readmitted"]])
##   <30   >30    NO 
## 11357 35545 54864
# ====== CLEANED DATASET ====
# Read diabetic.data.csv again (df.full above was loaded from the same file)
# into data1; only empty fields are mapped to NA.
# stringsAsFactors is pinned so text columns are factors regardless of R
# version (the default changed to FALSE in R 4.0.0).
# NOTE(review): no cleaning step is visible in this block -- at this point
# data1 still holds the raw file contents, including the "?" placeholders
# shown in the printed rows. Confirm where the actual cleaning happens.
data1 <- read.csv(
  "diabetic.data.csv",
  header = TRUE,
  sep = ",",
  na.strings = "",
  stringsAsFactors = TRUE
)
dim(data1)  # 101,766 observations x 50 variables
## [1] 101766     50
tail(data1, n = 20)
##        encounter_id patient_nbr            race gender     age weight
## 101747    443797298    89955270       Caucasian   Male [70-80)      ?
## 101748    443804570    33230016       Caucasian Female [70-80)      ?
## 101749    443811536   189481478       Caucasian Female [40-50)      ?
## 101750    443816024   106392411       Caucasian Female [70-80)      ?
## 101751    443824292   138784172       Caucasian Female [80-90)      ?
## 101752    443835140   175326800       Caucasian   Male [70-80)      ?
## 101753    443835512   139605341           Other Female [40-50)      ?
## 101754    443841992   184875899           Other   Male [40-50)      ?
## 101755    443842016   183087545       Caucasian Female [70-80)      ?
## 101756    443842022   188574944           Other Female [40-50)      ?
## 101757    443842070   140199494           Other Female [60-70)      ?
## 101758    443842136   181593374       Caucasian Female [70-80)      ?
## 101759    443842340   120975314       Caucasian Female [80-90)      ?
## 101760    443842778    86472243       Caucasian   Male [80-90)      ?
## 101761    443847176    50375628 AfricanAmerican Female [60-70)      ?
## 101762    443847548   100162476 AfricanAmerican   Male [70-80)      ?
## 101763    443847782    74694222 AfricanAmerican Female [80-90)      ?
## 101764    443854148    41088789       Caucasian   Male [70-80)      ?
## 101765    443857166    31693671       Caucasian Female [80-90)      ?
## 101766    443867222   175429310       Caucasian   Male [70-80)      ?
##        admission_type_id discharge_disposition_id admission_source_id
## 101747                 1                        1                   7
## 101748                 1                       22                   7
## 101749                 1                        4                   7
## 101750                 3                        6                   1
## 101751                 3                        1                   1
## 101752                 3                        6                   1
## 101753                 3                        1                   1
## 101754                 1                        1                   7
## 101755                 1                        1                   7
## 101756                 1                        1                   7
## 101757                 1                        1                   7
## 101758                 1                        1                   7
## 101759                 1                        1                   7
## 101760                 1                        1                   7
## 101761                 1                        1                   7
## 101762                 1                        3                   7
## 101763                 1                        4                   5
## 101764                 1                        1                   7
## 101765                 2                        3                   7
## 101766                 1                        1                   7
##        time_in_hospital payer_code medical_specialty num_lab_procedures
## 101747                4         MC                 ?                  2
## 101748                8         MC  InternalMedicine                 51
## 101749               14         MD                 ?                 69
## 101750                3         MC       Orthopedics                 27
## 101751                3         MD                 ?                 31
## 101752               13         MC                 ?                 77
## 101753                3         HM                 ?                 13
## 101754               13          ?                 ?                 51
## 101755                9          ?                 ?                 50
## 101756               14         MD                 ?                 73
## 101757                2         MD                 ?                 46
## 101758                5          ?                 ?                 21
## 101759                5         MC                 ?                 76
## 101760                1         MC                 ?                  1
## 101761                6         DM                 ?                 45
## 101762                3         MC                 ?                 51
## 101763                5         MC                 ?                 33
## 101764                1         MC                 ?                 53
## 101765               10         MC   Surgery-General                 45
## 101766                6          ?                 ?                 13
##        num_procedures num_medications number_outpatient number_emergency
## 101747              0               7                 1                0
## 101748              6              19                 0                0
## 101749              0              16                 0                0
## 101750              1              29                 0                1
## 101751              2              24                 0                0
## 101752              6              65                 0                0
## 101753              1               5                 0                0
## 101754              2              13                 0                0
## 101755              2              33                 0                0
## 101756              6              26                 0                1
## 101757              6              17                 1                1
## 101758              1              16                 0                0
## 101759              1              22                 0                1
## 101760              0              15                 3                0
## 101761              1              25                 3                1
## 101762              0              16                 0                0
## 101763              3              18                 0                0
## 101764              0               9                 1                0
## 101765              2              21                 0                0
## 101766              3               3                 0                0
##        number_inpatient diag_1 diag_2 diag_3 number_diagnoses
## 101747                0    427    427    250                5
## 101748                0    410    311    250                9
## 101749                0    295    305    250                5
## 101750                0    715    401    250                9
## 101751                0    574    574    250                9
## 101752                0    424    429    486               16
## 101753                0    348    784    782                8
## 101754                0  250.8    730    731                9
## 101755                0    574    574 250.02                9
## 101756                0    592    599    518                9
## 101757                1    996    585    403                9
## 101758                1    491    518    511                9
## 101759                0    292      8    304                9
## 101760                0    435    784    250                7
## 101761                2    345    438    412                9
## 101762                0 250.13    291    458                9
## 101763                1    560    276    787                9
## 101764                0     38    590    296               13
## 101765                1    996    285    998                9
## 101766                0    530    530    787                9
##        max_glu_serum A1Cresult metformin repaglinide nateglinide
## 101747          None      None        No          No          No
## 101748          None        >7        No          No          No
## 101749          None        >7        Up          No          No
## 101750          None      Norm    Steady          No          No
## 101751          None      None        No          No          No
## 101752          None      Norm        No          No          No
## 101753          None      None    Steady          No          No
## 101754          None      None    Steady          No          No
## 101755          None        >7        No          No          No
## 101756          None        >8        No          No          No
## 101757          None      None        No          No          No
## 101758          None      None        No          No          No
## 101759          None      None        No          No          No
## 101760          None      None        No          No          No
## 101761          None      None        No          No          No
## 101762          None        >8    Steady          No          No
## 101763          None      None        No          No          No
## 101764          None      None    Steady          No          No
## 101765          None      None        No          No          No
## 101766          None      None        No          No          No
##        chlorpropamide glimepiride acetohexamide glipizide glyburide
## 101747             No          No            No    Steady        No
## 101748             No          No            No        No        No
## 101749             No          No            No        No    Steady
## 101750             No          No            No    Steady        No
## 101751             No          No            No        No        No
## 101752             No          No            No        No        No
## 101753             No          No            No        No    Steady
## 101754             No          No            No        No        No
## 101755             No          No            No        No        Up
## 101756             No          No            No    Steady        No
## 101757             No          No            No        No        No
## 101758             No          No            No        No        No
## 101759             No          No            No        No        No
## 101760             No          No            No        No        No
## 101761             No          No            No        No        No
## 101762             No          No            No        No        No
## 101763             No          No            No        No        No
## 101764             No          No            No        No        No
## 101765             No          No            No    Steady        No
## 101766             No          No            No        No        No
##        tolbutamide pioglitazone rosiglitazone acarbose miglitol
## 101747          No           No            No       No       No
## 101748          No           No            No       No       No
## 101749          No           No            No       No       No
## 101750          No           No            No       No       No
## 101751          No           No            No       No       No
## 101752          No           No            No       No       No
## 101753          No           No            No       No       No
## 101754          No           No            No       No       No
## 101755          No           No            No       No       No
## 101756          No           No            No       No       No
## 101757          No           No            No       No       No
## 101758          No           No            No       No       No
## 101759          No           No            No       No       No
## 101760          No           No            No       No       No
## 101761          No           No        Steady       No       No
## 101762          No           No            No       No       No
## 101763          No           No            No       No       No
## 101764          No           No            No       No       No
## 101765          No       Steady            No       No       No
## 101766          No           No            No       No       No
##        troglitazone tolazamide examide citoglipton insulin
## 101747           No         No      No          No      No
## 101748           No         No      No          No  Steady
## 101749           No         No      No          No    Down
## 101750           No         No      No          No  Steady
## 101751           No         No      No          No    Down
## 101752           No         No      No          No      Up
## 101753           No         No      No          No  Steady
## 101754           No         No      No          No    Down
## 101755           No         No      No          No  Steady
## 101756           No         No      No          No      Up
## 101757           No         No      No          No  Steady
## 101758           No         No      No          No  Steady
## 101759           No         No      No          No      Up
## 101760           No         No      No          No      Up
## 101761           No         No      No          No    Down
## 101762           No         No      No          No    Down
## 101763           No         No      No          No  Steady
## 101764           No         No      No          No    Down
## 101765           No         No      No          No      Up
## 101766           No         No      No          No      No
##        glyburide.metformin glipizide.metformin glimepiride.pioglitazone
## 101747                  No                  No                       No
## 101748                  No                  No                       No
## 101749                  No                  No                       No
## 101750                  No                  No                       No
## 101751                  No                  No                       No
## 101752                  No                  No                       No
## 101753                  No                  No                       No
## 101754                  No                  No                       No
## 101755                  No                  No                       No
## 101756                  No                  No                       No
## 101757                  No                  No                       No
## 101758                  No                  No                       No
## 101759                  No                  No                       No
## 101760                  No                  No                       No
## 101761                  No                  No                       No
## 101762                  No                  No                       No
## 101763                  No                  No                       No
## 101764                  No                  No                       No
## 101765                  No                  No                       No
## 101766                  No                  No                       No
##        metformin.rosiglitazone metformin.pioglitazone change diabetesMed
## 101747                      No                     No     No         Yes
## 101748                      No                     No     No         Yes
## 101749                      No                     No     Ch         Yes
## 101750                      No                     No     Ch         Yes
## 101751                      No                     No     Ch         Yes
## 101752                      No                     No     Ch         Yes
## 101753                      No                     No     Ch         Yes
## 101754                      No                     No     Ch         Yes
## 101755                      No                     No     Ch         Yes
## 101756                      No                     No     Ch         Yes
## 101757                      No                     No     No         Yes
## 101758                      No                     No     No         Yes
## 101759                      No                     No     Ch         Yes
## 101760                      No                     No     Ch         Yes
## 101761                      No                     No     Ch         Yes
## 101762                      No                     No     Ch         Yes
## 101763                      No                     No     No         Yes
## 101764                      No                     No     Ch         Yes
## 101765                      No                     No     Ch         Yes
## 101766                      No                     No     No          No
##        readmitted
## 101747        <30
## 101748        >30
## 101749        >30
## 101750         NO
## 101751        <30
## 101752         NO
## 101753         NO
## 101754         NO
## 101755        >30
## 101756        >30
## 101757        >30
## 101758         NO
## 101759         NO
## 101760         NO
## 101761        >30
## 101762        >30
## 101763         NO
## 101764         NO
## 101765         NO
## 101766         NO
# head(data1, 20) View(data1)

# Drop 17 of the 50 columns by position. The indices are tied to the column
# order of the input, so this line silently removes the wrong variables if
# that order ever changes. Comparing the column listing printed above with the
# names() output below, the dropped columns include chlorpropamide,
# acetohexamide, tolbutamide, acarbose, miglitol, troglitazone, tolazamide,
# examide, citoglipton and the five combination-drug columns
# (glyburide.metformin ... metformin.pioglitazone).
# NOTE(review): index 6 is presumably `weight` (6th column of df.full) and
# 11:12 are not visible in this part of the output -- confirm against
# names() of the 50-column frame.
data1 <- data1[-c(6, 11:12, 28, 30, 33, 36:41, 43:47)]  # getting rid of unhelpful vars
names(data1)
##  [1] "encounter_id"             "patient_nbr"             
##  [3] "race"                     "gender"                  
##  [5] "age"                      "admission_type_id"       
##  [7] "discharge_disposition_id" "admission_source_id"     
##  [9] "time_in_hospital"         "num_lab_procedures"      
## [11] "num_procedures"           "num_medications"         
## [13] "number_outpatient"        "number_emergency"        
## [15] "number_inpatient"         "diag_1"                  
## [17] "diag_2"                   "diag_3"                  
## [19] "number_diagnoses"         "max_glu_serum"           
## [21] "A1Cresult"                "metformin"               
## [23] "repaglinide"              "nateglinide"             
## [25] "glimepiride"              "glipizide"               
## [27] "glyburide"                "pioglitazone"            
## [29] "rosiglitazone"            "insulin"                 
## [31] "change"                   "diabetesMed"             
## [33] "readmitted"
dim(data1)  # 101766 x 33
## [1] 101766     33
summary(data1)
##   encounter_id        patient_nbr                     race      
##  Min.   :    12522   Min.   :      135   ?              : 2273  
##  1st Qu.: 84961194   1st Qu.: 23413221   AfricanAmerican:19210  
##  Median :152388987   Median : 45505143   Asian          :  641  
##  Mean   :165201646   Mean   : 54330401   Caucasian      :76099  
##  3rd Qu.:230270888   3rd Qu.: 87545950   Hispanic       : 2037  
##  Max.   :443867222   Max.   :189502619   Other          : 1506  
##                                                                 
##              gender           age        admission_type_id
##  Female         :54708   [70-80):26068   Min.   :1.000    
##  Male           :47055   [60-70):22483   1st Qu.:1.000    
##  Unknown/Invalid:    3   [50-60):17256   Median :1.000    
##                          [80-90):17197   Mean   :2.024    
##                          [40-50): 9685   3rd Qu.:3.000    
##                          [30-40): 3775   Max.   :8.000    
##                          (Other): 5302                    
##  discharge_disposition_id admission_source_id time_in_hospital
##  Min.   : 1.000           Min.   : 1.000      Min.   : 1.000  
##  1st Qu.: 1.000           1st Qu.: 1.000      1st Qu.: 2.000  
##  Median : 1.000           Median : 7.000      Median : 4.000  
##  Mean   : 3.716           Mean   : 5.754      Mean   : 4.396  
##  3rd Qu.: 4.000           3rd Qu.: 7.000      3rd Qu.: 6.000  
##  Max.   :28.000           Max.   :25.000      Max.   :14.000  
##                                                               
##  num_lab_procedures num_procedures num_medications number_outpatient
##  Min.   :  1.0      Min.   :0.00   Min.   : 1.00   Min.   : 0.0000  
##  1st Qu.: 31.0      1st Qu.:0.00   1st Qu.:10.00   1st Qu.: 0.0000  
##  Median : 44.0      Median :1.00   Median :15.00   Median : 0.0000  
##  Mean   : 43.1      Mean   :1.34   Mean   :16.02   Mean   : 0.3694  
##  3rd Qu.: 57.0      3rd Qu.:2.00   3rd Qu.:20.00   3rd Qu.: 0.0000  
##  Max.   :132.0      Max.   :6.00   Max.   :81.00   Max.   :42.0000  
##                                                                     
##  number_emergency  number_inpatient      diag_1          diag_2     
##  Min.   : 0.0000   Min.   : 0.0000   428    : 6862   276    : 6752  
##  1st Qu.: 0.0000   1st Qu.: 0.0000   414    : 6581   428    : 6662  
##  Median : 0.0000   Median : 0.0000   786    : 4016   250    : 6071  
##  Mean   : 0.1978   Mean   : 0.6356   410    : 3614   427    : 5036  
##  3rd Qu.: 0.0000   3rd Qu.: 1.0000   486    : 3508   401    : 3736  
##  Max.   :76.0000   Max.   :21.0000   427    : 2766   496    : 3305  
##                                      (Other):74419   (Other):70204  
##      diag_3      number_diagnoses max_glu_serum A1Cresult   
##  250    :11555   Min.   : 1.000   >200: 1485    >7  : 3812  
##  401    : 8289   1st Qu.: 6.000   >300: 1264    >8  : 8216  
##  276    : 5175   Median : 8.000   None:96420    None:84748  
##  428    : 4577   Mean   : 7.423   Norm: 2597    Norm: 4990  
##  427    : 3955   3rd Qu.: 9.000                             
##  414    : 3664   Max.   :16.000                             
##  (Other):64551                                              
##   metformin     repaglinide     nateglinide     glimepiride   
##  Down  :  575   Down  :    45   Down  :    11   Down  :  194  
##  No    :81778   No    :100227   No    :101063   No    :96575  
##  Steady:18346   Steady:  1384   Steady:   668   Steady: 4670  
##  Up    : 1067   Up    :   110   Up    :    24   Up    :  327  
##                                                               
##                                                               
##                                                               
##   glipizide      glyburide     pioglitazone   rosiglitazone 
##  Down  :  560   Down  :  564   Down  :  118   Down  :   87  
##  No    :89080   No    :91116   No    :94438   No    :95401  
##  Steady:11356   Steady: 9274   Steady: 6976   Steady: 6100  
##  Up    :  770   Up    :  812   Up    :  234   Up    :  178  
##                                                             
##                                                             
##                                                             
##    insulin      change     diabetesMed readmitted 
##  Down  :12218   Ch:47011   No :23403   <30:11357  
##  No    :47383   No:54755   Yes:78363   >30:35545  
##  Steady:30849                          NO :54864  
##  Up    :11316                                     
##                                                   
##                                                   
## 
# <<<<<<<<<< NA VALUES >>>>>>>>>
sum(is.na(data1))
## [1] 0
# Count NA values in each column; every count is 0, i.e. nothing was parsed
# as NA. vapply() pins the result to one integer per column.
# NOTE: this does not mean the data are complete -- summary(data1) above shows
# 2273 rows with race == "?", and such "?" placeholders are not counted here.
vapply(data1, function(x) sum(is.na(x)), integer(1))
##             encounter_id              patient_nbr                     race 
##                        0                        0                        0 
##                   gender                      age        admission_type_id 
##                        0                        0                        0 
## discharge_disposition_id      admission_source_id         time_in_hospital 
##                        0                        0                        0 
##       num_lab_procedures           num_procedures          num_medications 
##                        0                        0                        0 
##        number_outpatient         number_emergency         number_inpatient 
##                        0                        0                        0 
##                   diag_1                   diag_2                   diag_3 
##                        0                        0                        0 
##         number_diagnoses            max_glu_serum                A1Cresult 
##                        0                        0                        0 
##                metformin              repaglinide              nateglinide 
##                        0                        0                        0 
##              glimepiride                glipizide                glyburide 
##                        0                        0                        0 
##             pioglitazone            rosiglitazone                  insulin 
##                        0                        0                        0 
##                   change              diabetesMed               readmitted 
##                        0                        0                        0

1.0.0.1 Variables of interest

1.0.0.1.1 Readmitted
summary(data1$readmitted)
##   <30   >30    NO 
## 11357 35545 54864
# <30 >30 NO 11357 35545 54864
1.0.0.1.2 Race
# variables of interest
summary(data1$race)  # boxplot readmit by race
##               ? AfricanAmerican           Asian       Caucasian 
##            2273           19210             641           76099 
##        Hispanic           Other 
##            2037            1506
# filter by race (AfricanAmerican, Asian, Caucasian, Hispanic, Other) &&
# ------ AfricanAmerican ----
# Split African American encounters by readmission outcome.
readmit_less30.afamer <- filter(data1, race == "AfricanAmerican", readmitted == "<30")
dim(readmit_less30.afamer)  # 2155
## [1] 2155   33
readmit_more30.afamer <- filter(data1, race == "AfricanAmerican", readmitted == ">30")
dim(readmit_more30.afamer)  # 6634
## [1] 6634   33
readmit_none.afamer <- filter(data1, race == "AfricanAmerican", readmitted == "NO")
dim(readmit_none.afamer)  # 10421
## [1] 10421    33
# Take the pie slices from the filtered frames rather than retyping the
# printed counts, so they cannot drift away from the data.
slices.afamer <- c(nrow(readmit_less30.afamer), nrow(readmit_more30.afamer),
    nrow(readmit_none.afamer))  # 2155, 6634, 10421 (19210 total)
lbls.afamer <- c("<30", ">30", "none")
pct.afamer <- round(slices.afamer/sum(slices.afamer) * 100)
lbls.afamer <- paste(lbls.afamer, "-(", pct.afamer, ")")  # add percents to labels
lbls.afamer <- paste0(lbls.afamer, "%")  # add % to labels

# ---- ASIAN ----
# Split Asian encounters by readmission outcome.
readmit_less30.asian <- filter(data1, race == "Asian", readmitted == "<30")
dim(readmit_less30.asian)  # 65
## [1] 65 33
readmit_more30.asian <- filter(data1, race == "Asian", readmitted == ">30")
dim(readmit_more30.asian)  # 161
## [1] 161  33
readmit_none.asian <- filter(data1, race == "Asian", readmitted == "NO")
dim(readmit_none.asian)  # 415
## [1] 415  33
# Take the pie slices from the filtered frames rather than retyping the
# printed counts, so they cannot drift away from the data.
slices.asian <- c(nrow(readmit_less30.asian), nrow(readmit_more30.asian),
    nrow(readmit_none.asian))  # 65, 161, 415 (641 total)
lbls.asian <- c("<30", ">30", "none")
pct.asian <- round(slices.asian/sum(slices.asian) * 100)
lbls.asian <- paste(lbls.asian, "-(", pct.asian, ")")  # add percents to labels
lbls.asian <- paste0(lbls.asian, "%")  # add % to labels

# ---- CAUCASIAN ----
# Split Caucasian encounters by readmission outcome.
readmit_less30.cau <- filter(data1, race == "Caucasian", readmitted == "<30")
dim(readmit_less30.cau)  # 8592
## [1] 8592   33
readmit_more30.cau <- filter(data1, race == "Caucasian", readmitted == ">30")
dim(readmit_more30.cau)  # 27124
## [1] 27124    33
readmit_none.cau <- filter(data1, race == "Caucasian", readmitted == "NO")
dim(readmit_none.cau)  # 40383
## [1] 40383    33
# Take the pie slices from the filtered frames rather than retyping the
# printed counts, so they cannot drift away from the data.
slices.cau <- c(nrow(readmit_less30.cau), nrow(readmit_more30.cau),
    nrow(readmit_none.cau))  # 8592, 27124, 40383 (76099 total)
lbls.cau <- c("<30", ">30", "none")
pct.cau <- round(slices.cau/sum(slices.cau) * 100)
lbls.cau <- paste(lbls.cau, "-(", pct.cau, ")")  # add percents to labels
lbls.cau <- paste0(lbls.cau, "%")  # add % to labels

# ---- HISPANIC ----
# Split Hispanic encounters by readmission outcome.
readmit_less30.hisp <- filter(data1, race == "Hispanic", readmitted == "<30")
dim(readmit_less30.hisp)  # 212
## [1] 212  33
readmit_more30.hisp <- filter(data1, race == "Hispanic", readmitted == ">30")
dim(readmit_more30.hisp)  # 642
## [1] 642  33
readmit_none.hisp <- filter(data1, race == "Hispanic", readmitted == "NO")
dim(readmit_none.hisp)  # 1183
## [1] 1183   33
# Take the pie slices from the filtered frames rather than retyping the
# printed counts, so they cannot drift away from the data.
slices.hisp <- c(nrow(readmit_less30.hisp), nrow(readmit_more30.hisp),
    nrow(readmit_none.hisp))  # 212, 642, 1183 (2037 total)
lbls.hisp <- c("<30", ">30", "none")
pct.hisp <- round(slices.hisp/sum(slices.hisp) * 100)
lbls.hisp <- paste(lbls.hisp, "-(", pct.hisp, ")")  # add percents to labels
lbls.hisp <- paste0(lbls.hisp, "%")  # add % to labels

# ---- OTHER ----
# Split encounters with race == "Other" by readmission outcome.
readmit_less30.oth <- filter(data1, race == "Other", readmitted == "<30")
dim(readmit_less30.oth)  # 145
## [1] 145  33
readmit_more30.oth <- filter(data1, race == "Other", readmitted == ">30")
dim(readmit_more30.oth)  # 446
## [1] 446  33
readmit_none.oth <- filter(data1, race == "Other", readmitted == "NO")
dim(readmit_none.oth)  # 915
## [1] 915  33
# Take the pie slices from the filtered frames rather than retyping the
# printed counts, so they cannot drift away from the data.
slices.oth <- c(nrow(readmit_less30.oth), nrow(readmit_more30.oth),
    nrow(readmit_none.oth))  # 145, 446, 915 (1506 total)
lbls.oth <- c("<30", ">30", "none")
pct.oth <- round(slices.oth/sum(slices.oth) * 100)
lbls.oth <- paste(lbls.oth, "-(", pct.oth, ")")  # add percents to labels
lbls.oth <- paste0(lbls.oth, "%")  # add % to labels



# One readmission pie per race on a shared 3 x 2 grid. par() returns the
# previous settings, which are restored afterwards so that later base plots
# do not inherit this layout.
old.par <- par(mfrow = c(3, 2))
pie(slices.afamer, labels = lbls.afamer, col = rainbow(length(lbls.afamer)), 
    main = "Pie Chart of African American Readmits")
pie(slices.asian, labels = lbls.asian, col = rainbow(length(lbls.asian)), main = "Pie Chart of Asian Readmits")
pie(slices.cau, labels = lbls.cau, col = rainbow(length(lbls.cau)), main = "Pie Chart of Caucasian Readmits")
pie(slices.hisp, labels = lbls.hisp, col = rainbow(length(lbls.hisp)), main = "Pie Chart of Hispanic Readmits")
# Colours come from this pie's own labels (previously borrowed lbls.hisp).
pie(slices.oth, labels = lbls.oth, col = rainbow(length(lbls.oth)), main = "Pie Chart of Other Races Readmits")
par(old.par)
1.0.0.1.3 Gender
summary(data1$gender)  #boxplot
##          Female            Male Unknown/Invalid 
##           54708           47055               3
# Female Male Unknown/Invalid 54708 47055 3
# Encounters readmitted within 30 days, then broken down by gender. The
# percentages in the comments are: share of the <30 group, and share of all
# encounters of that gender (54708 female, 47055 male in summary() above).
readmit_less30.gender <- filter(data1, readmitted == "<30")
dim(readmit_less30.gender)  # 11357 total observations
## [1] 11357    33
dim(filter(readmit_less30.gender, gender == "Female"))  # 6152 female, ~54% of <30 dataset, 11.2% of all female encounters
## [1] 6152   33
dim(filter(readmit_less30.gender, gender == "Male"))  # 5205 male, ~46% of <30 dataset, 11.1% of all male encounters
## [1] 5205   33
# Encounters readmitted after more than 30 days, then broken down by gender.
readmit_more30.gender <- filter(data1, readmitted == ">30")

nrow(readmit_more30.gender)  #35545 total observations
## [1] 35545
nrow(filter(readmit_more30.gender, gender == "Female"))  #19518 female, ~55% of >30 dataset, 35.7% of all female encounters
## [1] 19518
nrow(filter(readmit_more30.gender, gender == "Male"))  #16027 male, ~45% of >30 dataset, 34.1% of all male encounters
## [1] 16027
# Gender shares of the >30 group, computed from the data instead of retyping
# the counts printed above. Despite the "perc" prefix these are proportions
# in [0, 1], not percentages.
perc.female <- mean(readmit_more30.gender$gender == "Female")
perc.female  #0.5491068
## [1] 0.5491068
perc.male <- mean(readmit_more30.gender$gender == "Male")
perc.male  # 0.4508932
## [1] 0.4508932
# Bar charts of readmission counts by gender for the <30 and >30 groups.
# The former par(mfrow = c(2, 2)) call was removed: par() only configures base
# graphics, so it never arranged these ggplot2 figures and merely leaked a
# 2 x 2 layout into later base plots.
# nrow(which(readmit_less30.gender == 'Female'))
# nrow(filter(readmit_less30.gender, gender == 'Female'))
# nrow(readmit_less30.gender) x.perc.gender <-
# c(nrow(filter(readmit_less30.gender, gender ==
# 'Female'))/nrow(readmit_less30.gender), nrow(filter(readmit_less30.gender,
# gender == 'Male'))/nrow(readmit_less30.gender)) x.perc.gender
ggplot(readmit_less30.gender) +
    geom_bar(aes(x = gender), fill = "blue") +
    labs(title = "Histogram of readmits in less than 30 days (<30) by gender", 
        x = "Gender", y = "Frequency")
ggplot(readmit_more30.gender) +
    geom_bar(aes(x = gender), fill = "blue") +
    labs(title = "Histogram of readmits in more than 30 days (>30) by gender", 
        x = "Gender", y = "Frequency")

In the cleaned dataset we have 54708 female observations and 47055 male observations, which means roughly 54% of the patients under consideration were female (across all readmission categories), while ~46% were male. When comparing hospital readmits stratified by gender, of the patients who were readmitted in under 30 days approximately 54% (6152/11357) were female, matching the overall female representation. Similarly, of the patients who were readmitted after more than 30 days, about 55% (19518/35545) were female. It’s worth noting that the total number of patients (male & female) readmitted after more than 30 days is about 3 times the number readmitted in less than 30 days.

There seems to be a gap between genders here, implying that women are more prone to readmission, but this is quickly rebutted when we compare each gender against its own total number of observations. For patients who were readmitted in less than 30 days, female patients represent 11.2% (6152/54708) of the total female population, while male patients represent a similar 11.1% (5205/47055) of the overall male population. The same is true for patients readmitted after more than 30 days: female patients account for 35.7% (19518/54708) of the total female population, while male patients comprise 34.1% (16027/47055) of the total male population.

This lends credence to the notion that gender does not contribute to likelihood of readmission.

1.0.0.1.4 Age
summary(data1$age)  #scatterplot
##   [0-10)  [10-20)  [20-30)  [30-40)  [40-50)  [50-60)  [60-70)  [70-80) 
##      161      691     1657     3775     9685    17256    22483    26068 
##  [80-90) [90-100) 
##    17197     2793
# <<< AGE BAND vs. READMISSION CATEGORY >>>>>
# Both age and readmitted are categorical (see their summaries above), so a
# least-squares line is not defined for this pair: lm(readmitted ~ age) only
# produced "factor response" warnings, and abline() then used just 2 of the 10
# fitted coefficients. The regression overlay was therefore removed.
# plot() of one factor against another draws a spine plot, i.e. the share of
# each readmission category within each age band.
plot(data1$age, data1$readmitted, pch = 16, xlab = "Patient age", ylab = "Readmission category", 
    main = "Patient age vs. readmission category")

It appears that the categories with the largest number of readmits are 70-80 and 80-90, which are almost identical. An interesting trend is that the 20-30 age group has the highest overall frequency of readmission within 30 days, which is surprising.

1.0.0.1.5 Change (in diabetes medication)
summary(data1$change)  #boxplot - change in diabetes medication
##    Ch    No 
## 47011 54755
# Ch No 47011 54755

# Patients readmitted within 30 days, split by the medication-change flag.
# Comment percentages: share of the <30 group, then share of all patients
# with that flag value.
readmit_less30.change <- data1 %>% filter(readmitted == "<30")
readmit_less30.change %>% dim()  # 11357 total observations
## [1] 11357    33
readmit_less30.change %>% filter(change == "Ch") %>% dim()  # 5558 with a change of meds: 48.9% of the <30 group, 11.8% of all patients with a change in meds
## [1] 5558   33
readmit_less30.change %>% filter(change == "No") %>% dim()  # 5799 with NO change in meds: 51.1% of the <30 group, 10.6% of all patients with NO change in meds
## [1] 5799   33
# >30 readmit patients, split by the medication-change flag. All ratios are
# computed from the data instead of retyping the counts printed by dim().
readmit_more30.change <- filter(data1, readmitted == ">30")
dim(readmit_more30.change)  #35545 observations
## [1] 35545    33
dim(filter(readmit_more30.change, change == "Ch"))  #17272
## [1] 17272    33
# Share of the >30 group that had a medication change.
perc.readmit_more30.ch <- mean(readmit_more30.change$change == "Ch")
perc.readmit_more30.ch  #0.4859193
## [1] 0.4859193
# Share of all patients with a medication change who fall in the >30 group.
perc.all.ch <- sum(readmit_more30.change$change == "Ch")/sum(data1$change == "Ch")
perc.all.ch  #0.3674034
## [1] 0.3674034
dim(filter(readmit_more30.change, change == "No"))  #18273
## [1] 18273    33
# Share of the >30 group that had no medication change.
perc.readmit_more30.no <- mean(readmit_more30.change$change == "No")
perc.readmit_more30.no  #0.5140807
## [1] 0.5140807
# Share of all patients without a medication change who fall in the >30 group.
perc.all.no <- sum(readmit_more30.change$change == "No")/sum(data1$change == "No")
perc.all.no  #0.3337229
## [1] 0.3337229
# pie charts: medication-change status within the <30 and the >30 groups.
# Counts are taken from the filtered frames (order: "Ch", "No") rather than
# retyped from the output above. par() returns the previous settings, which
# are restored at the end so later base plots do not inherit the 2 x 1 grid.
old.par <- par(mfrow = c(2, 1))
slices.change <- as.vector(table(readmit_less30.change$change)[c("Ch", "No")])  # 5558, 5799
lbls.change <- c("change in medication", "no change in medication")
pct.change <- round(slices.change/sum(slices.change) * 100)
lbls.change <- paste(lbls.change, "-(", pct.change, ")")  # add percents to labels
lbls.change <- paste0(lbls.change, "%")  # add % to labels
pie(slices.change, labels = lbls.change, col = rainbow(length(lbls.change)), 
    main = "Pie Chart of change in diabetes medication status for patients readmitted <30 days")
# NOTE: despite the ".nochange" suffix, these objects describe the >30 group
# (both "Ch" and "No"); the names are kept as they were.
slices.nochange <- as.vector(table(readmit_more30.change$change)[c("Ch", "No")])  # 17272, 18273
lbls.nochange <- c("change in medication", "no change in medication")
pct.nochange <- round(slices.nochange/sum(slices.nochange) * 100)
lbls.nochange <- paste(lbls.nochange, "-(", pct.nochange, ")")  # add percents to labels
lbls.nochange <- paste0(lbls.nochange, "%")  # add % to labels
pie(slices.nochange, labels = lbls.nochange, col = rainbow(length(lbls.nochange)), 
    main = "Pie Chart of change in diabetes medication status for patients readmitted >30 days")
par(old.par)
1.0.0.1.6 Number of diagnoses
summary(data1$number_diagnoses)  #bar plot
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   6.000   8.000   7.423   9.000  16.000
# Distribution of the number of diagnoses among patients readmitted within
# 30 days, followed by the same for those readmitted after more than 30 days.
# The hist() calls are kept verbatim because hist() builds its default title
# and axis label from the expression it is given.
readmit_less30.diag <- data1 %>% filter(readmitted == "<30")
hist(readmit_less30.diag$number_diagnoses)
readmit_more30.diag <- data1 %>% filter(readmitted == ">30")
hist(readmit_more30.diag$number_diagnoses)

There consistently seems to be a large spike in frequency around 9 diagnoses.

2 Model building

2.0.0.1 Investigating collinearity

Some of the features provided seem like they might be highly correlated with one another, in particular the diagnosis and health-indicator information, as well as the hospital visit information.

# Pair-wise scatter plots (base graphics) of the hospital-visit count
# variables, as a visual check for collinearity.
data1 %>%
    select_if(is.numeric) %>%
    select(
        time_in_hospital,
        num_lab_procedures,
        num_procedures,
        num_medications,
        number_outpatient,
        number_emergency,
        number_inpatient,
        number_diagnoses
    ) %>%
    pairs()

2.0.0.2 Further prepping data

# <<<< make readmitted categorical>>> ======= readmitted = column 33 =======
summary(data1)
##   encounter_id        patient_nbr                     race      
##  Min.   :    12522   Min.   :      135   ?              : 2273  
##  1st Qu.: 84961194   1st Qu.: 23413221   AfricanAmerican:19210  
##  Median :152388987   Median : 45505143   Asian          :  641  
##  Mean   :165201646   Mean   : 54330401   Caucasian      :76099  
##  3rd Qu.:230270888   3rd Qu.: 87545950   Hispanic       : 2037  
##  Max.   :443867222   Max.   :189502619   Other          : 1506  
##                                                                 
##              gender           age        admission_type_id
##  Female         :54708   [70-80):26068   Min.   :1.000    
##  Male           :47055   [60-70):22483   1st Qu.:1.000    
##  Unknown/Invalid:    3   [50-60):17256   Median :1.000    
##                          [80-90):17197   Mean   :2.024    
##                          [40-50): 9685   3rd Qu.:3.000    
##                          [30-40): 3775   Max.   :8.000    
##                          (Other): 5302                    
##  discharge_disposition_id admission_source_id time_in_hospital
##  Min.   : 1.000           Min.   : 1.000      Min.   : 1.000  
##  1st Qu.: 1.000           1st Qu.: 1.000      1st Qu.: 2.000  
##  Median : 1.000           Median : 7.000      Median : 4.000  
##  Mean   : 3.716           Mean   : 5.754      Mean   : 4.396  
##  3rd Qu.: 4.000           3rd Qu.: 7.000      3rd Qu.: 6.000  
##  Max.   :28.000           Max.   :25.000      Max.   :14.000  
##                                                               
##  num_lab_procedures num_procedures num_medications number_outpatient
##  Min.   :  1.0      Min.   :0.00   Min.   : 1.00   Min.   : 0.0000  
##  1st Qu.: 31.0      1st Qu.:0.00   1st Qu.:10.00   1st Qu.: 0.0000  
##  Median : 44.0      Median :1.00   Median :15.00   Median : 0.0000  
##  Mean   : 43.1      Mean   :1.34   Mean   :16.02   Mean   : 0.3694  
##  3rd Qu.: 57.0      3rd Qu.:2.00   3rd Qu.:20.00   3rd Qu.: 0.0000  
##  Max.   :132.0      Max.   :6.00   Max.   :81.00   Max.   :42.0000  
##                                                                     
##  number_emergency  number_inpatient      diag_1          diag_2     
##  Min.   : 0.0000   Min.   : 0.0000   428    : 6862   276    : 6752  
##  1st Qu.: 0.0000   1st Qu.: 0.0000   414    : 6581   428    : 6662  
##  Median : 0.0000   Median : 0.0000   786    : 4016   250    : 6071  
##  Mean   : 0.1978   Mean   : 0.6356   410    : 3614   427    : 5036  
##  3rd Qu.: 0.0000   3rd Qu.: 1.0000   486    : 3508   401    : 3736  
##  Max.   :76.0000   Max.   :21.0000   427    : 2766   496    : 3305  
##                                      (Other):74419   (Other):70204  
##      diag_3      number_diagnoses max_glu_serum A1Cresult   
##  250    :11555   Min.   : 1.000   >200: 1485    >7  : 3812  
##  401    : 8289   1st Qu.: 6.000   >300: 1264    >8  : 8216  
##  276    : 5175   Median : 8.000   None:96420    None:84748  
##  428    : 4577   Mean   : 7.423   Norm: 2597    Norm: 4990  
##  427    : 3955   3rd Qu.: 9.000                             
##  414    : 3664   Max.   :16.000                             
##  (Other):64551                                              
##   metformin     repaglinide     nateglinide     glimepiride   
##  Down  :  575   Down  :    45   Down  :    11   Down  :  194  
##  No    :81778   No    :100227   No    :101063   No    :96575  
##  Steady:18346   Steady:  1384   Steady:   668   Steady: 4670  
##  Up    : 1067   Up    :   110   Up    :    24   Up    :  327  
##                                                               
##                                                               
##                                                               
##   glipizide      glyburide     pioglitazone   rosiglitazone 
##  Down  :  560   Down  :  564   Down  :  118   Down  :   87  
##  No    :89080   No    :91116   No    :94438   No    :95401  
##  Steady:11356   Steady: 9274   Steady: 6976   Steady: 6100  
##  Up    :  770   Up    :  812   Up    :  234   Up    :  178  
##                                                             
##                                                             
##                                                             
##    insulin      change     diabetesMed readmitted 
##  Down  :12218   Ch:47011   No :23403   <30:11357  
##  No    :47383   No:54755   Yes:78363   >30:35545  
##  Steady:30849                          NO :54864  
##  Up    :11316                                     
##                                                   
##                                                   
## 
# Column names of data1 before the cleaning steps that follow.
colnames(data1)
##  [1] "encounter_id"             "patient_nbr"             
##  [3] "race"                     "gender"                  
##  [5] "age"                      "admission_type_id"       
##  [7] "discharge_disposition_id" "admission_source_id"     
##  [9] "time_in_hospital"         "num_lab_procedures"      
## [11] "num_procedures"           "num_medications"         
## [13] "number_outpatient"        "number_emergency"        
## [15] "number_inpatient"         "diag_1"                  
## [17] "diag_2"                   "diag_3"                  
## [19] "number_diagnoses"         "max_glu_serum"           
## [21] "A1Cresult"                "metformin"               
## [23] "repaglinide"              "nateglinide"             
## [25] "glimepiride"              "glipizide"               
## [27] "glyburide"                "pioglitazone"            
## [29] "rosiglitazone"            "insulin"                 
## [31] "change"                   "diabetesMed"             
## [33] "readmitted"
summary(data1[["readmitted"]])  # three-level factor (<30, >30, NO); collapsed to a binary outcome next
##   <30   >30    NO 
## 11357 35545 54864
# Binary outcome: "1" for a readmission within 30 days ("<30"), "0" for every
# other value (">30" and "NO").
data1$readmitted <- factor(as.integer(data1$readmitted == "<30"))
summary(data1$readmitted)  # the "1" count agrees with the earlier "<30" count
##     0     1 
## 90409 11357
# 0 1 90409 11357

# ---- Remove rows whose race is recorded as "?" ----
# Row counts noted by the author: 101766 rows before, 2273 of them with "?",
# 99493 rows after (99493 + 2273 = 101766).
data1 <- dplyr::filter(data1, race != "?")

# ---- Remove the identifier columns (not helpful as predictors) ----
# encounter_id and patient_nbr are dropped; every other column is kept.
data1 <- data1[setdiff(names(data1), c("encounter_id", "patient_nbr"))]
colnames(data1)
##  [1] "race"                     "gender"                  
##  [3] "age"                      "admission_type_id"       
##  [5] "discharge_disposition_id" "admission_source_id"     
##  [7] "time_in_hospital"         "num_lab_procedures"      
##  [9] "num_procedures"           "num_medications"         
## [11] "number_outpatient"        "number_emergency"        
## [13] "number_inpatient"         "diag_1"                  
## [15] "diag_2"                   "diag_3"                  
## [17] "number_diagnoses"         "max_glu_serum"           
## [19] "A1Cresult"                "metformin"               
## [21] "repaglinide"              "nateglinide"             
## [23] "glimepiride"              "glipizide"               
## [25] "glyburide"                "pioglitazone"            
## [27] "rosiglitazone"            "insulin"                 
## [29] "change"                   "diabetesMed"             
## [31] "readmitted"
# df <- subset(df, select = -c(a,c) )

# filtering/ categorical data w/ if/else statement readmit_less30.diag <-
# filter(data1, readmitted == '<30') bill.data.train$status <-
# factor(ifelse(bill.data.train$status=='bill:passed' |
# bill.data.train$status=='governor:signed' |
# bill.data.train$status=='governor:received', '1', '0'))

2.0.0.3 Initial modeling

# An initial logistic regression (binomial glm, not a linear model) on a
# hand-picked set of plausible predictors.
# summary(data1$max_glu_serum) summary(data1$insulin)
summary(data1$diabetesMed)
##    No   Yes 
## 23001 76492
# Logistic regression of the binary readmitted outcome on ten hand-picked
# predictors. Despite the "lm" in its name, the object is fitted with
# glm(family = "binomial").
lm.init <- glm(readmitted ~ race + gender + age + time_in_hospital + num_medications + 
    number_diagnoses + max_glu_serum + insulin + change + diabetesMed, data = data1, 
    family = "binomial")
summary(lm.init)
## 
## Call:
## glm(formula = readmitted ~ race + gender + age + time_in_hospital + 
##     num_medications + number_diagnoses + max_glu_serum + insulin + 
##     change + diabetesMed, family = "binomial", data = data1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.8115  -0.5159  -0.4731  -0.4249   2.8646  
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           -4.0843697  0.5908671  -6.913 4.76e-12 ***
## raceAsian             -0.0854691  0.1334039  -0.641  0.52173    
## raceCaucasian         -0.0296202  0.0263996  -1.122  0.26187    
## raceHispanic          -0.0644794  0.0764769  -0.843  0.39916    
## raceOther             -0.1946335  0.0907216  -2.145  0.03192 *  
## genderMale            -0.0004398  0.0204318  -0.022  0.98283    
## genderUnknown/Invalid -5.9332246 43.9540598  -0.135  0.89262    
## age[10-20)             1.0403554  0.6055723   1.718  0.08580 .  
## age[20-30)             1.8908809  0.5878383   3.217  0.00130 ** 
## age[30-40)             1.6089258  0.5859724   2.746  0.00604 ** 
## age[40-50)             1.4987686  0.5847586   2.563  0.01038 *  
## age[50-60)             1.3849415  0.5844642   2.370  0.01781 *  
## age[60-70)             1.5235998  0.5843370   2.607  0.00912 ** 
## age[70-80)             1.5778249  0.5842882   2.700  0.00693 ** 
## age[80-90)             1.5847077  0.5844794   2.711  0.00670 ** 
## age[90-100)            1.5012058  0.5871809   2.557  0.01057 *  
## time_in_hospital       0.0257416  0.0037188   6.922 4.45e-12 ***
## num_medications        0.0040770  0.0014423   2.827  0.00470 ** 
## number_diagnoses       0.0657795  0.0059991  10.965  < 2e-16 ***
## max_glu_serum>300      0.0982143  0.1134147   0.866  0.38650    
## max_glu_serumNone     -0.1209673  0.0800668  -1.511  0.13083    
## max_glu_serumNorm     -0.0274302  0.1008995  -0.272  0.78573    
## insulinNo             -0.3440998  0.0393727  -8.740  < 2e-16 ***
## insulinSteady         -0.2879667  0.0360047  -7.998 1.26e-15 ***
## insulinUp             -0.0915777  0.0386579  -2.369  0.01784 *  
## changeNo               0.1443421  0.0287230   5.025 5.03e-07 ***
## diabetesMedYes         0.1551495  0.0325334   4.769 1.85e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 69886  on 99492  degrees of freedom
## Residual deviance: 69245  on 99466  degrees of freedom
## AIC: 69299
## 
## Number of Fisher Scoring iterations: 7
# Per-term likelihood-ratio tests for lm.init (Type II, per the header of the
# printed table).
Anova(lm.init)
## Analysis of Deviance Table (Type II tests)
## 
## Response: readmitted
##                  LR Chisq Df Pr(>Chisq)    
## race                5.672  4    0.22499    
## gender              0.139  2    0.93292    
## age                91.422  9  8.438e-16 ***
## time_in_hospital   47.141  1  6.606e-12 ***
## num_medications     7.927  1    0.00487 ** 
## number_diagnoses  123.434  1  < 2.2e-16 ***
## max_glu_serum      10.733  3    0.01326 *  
## insulin            89.473  3  < 2.2e-16 ***
## change             25.424  1  4.602e-07 ***
## diabetesMed        22.736  1  1.859e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

An initial logistic model built from a set of plausible predictors shows that race and gender add nothing significant to the model (p = 0.22 and p = 0.93, respectively), while the other factors (age, time in hospital, number of medications taken, number of diagnoses, max glucose serum, insulin, change in medication, and diabetes medication) are all significant at the .05 level.

# <<<<<<<<<<< LASSO >>>>>>>>>>>>>> <<<<<<CV to select lambda>>>>>>>
# Inspect the column types of data1 before building the design matrix.
str(data1)
## 'data.frame':    99493 obs. of  31 variables:
##  $ race                    : Factor w/ 6 levels "?","AfricanAmerican",..: 4 4 2 4 4 4 4 4 4 4 ...
##  $ gender                  : Factor w/ 3 levels "Female","Male",..: 1 1 1 2 2 2 2 2 1 1 ...
##  $ age                     : Factor w/ 10 levels "[0-10)","[10-20)",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ admission_type_id       : int  6 1 1 1 1 2 3 1 2 3 ...
##  $ discharge_disposition_id: int  25 1 1 1 1 1 1 1 1 3 ...
##  $ admission_source_id     : int  1 7 7 7 7 2 2 7 4 4 ...
##  $ time_in_hospital        : int  1 3 2 2 1 3 4 5 13 12 ...
##  $ num_lab_procedures      : int  41 59 11 44 51 31 70 73 68 33 ...
##  $ num_procedures          : int  0 0 5 1 0 6 1 0 2 3 ...
##  $ num_medications         : int  1 18 13 16 8 16 21 12 28 18 ...
##  $ number_outpatient       : int  0 0 2 0 0 0 0 0 0 0 ...
##  $ number_emergency        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ number_inpatient        : int  0 0 1 0 0 0 0 0 0 0 ...
##  $ diag_1                  : Factor w/ 717 levels "?","10","11",..: 126 145 456 556 56 265 265 278 254 284 ...
##  $ diag_2                  : Factor w/ 749 levels "?","11","110",..: 1 81 80 99 26 248 248 316 262 48 ...
##  $ diag_3                  : Factor w/ 790 levels "?","11","110",..: 1 123 768 250 88 88 772 88 231 319 ...
##  $ number_diagnoses        : int  1 9 6 7 5 9 7 8 8 8 ...
##  $ max_glu_serum           : Factor w/ 4 levels ">200",">300",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ A1Cresult               : Factor w/ 4 levels ">7",">8","None",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ metformin               : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 3 2 2 2 ...
##  $ repaglinide             : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ nateglinide             : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ glimepiride             : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 3 2 2 2 ...
##  $ glipizide               : Factor w/ 4 levels "Down","No","Steady",..: 2 2 3 2 3 2 2 2 3 2 ...
##  $ glyburide               : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 3 2 2 ...
##  $ pioglitazone            : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ rosiglitazone           : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 3 ...
##  $ insulin                 : Factor w/ 4 levels "Down","No","Steady",..: 2 4 2 4 3 3 3 2 3 3 ...
##  $ change                  : Factor w/ 2 levels "Ch","No": 2 1 2 1 1 2 1 2 1 1 ...
##  $ diabetesMed             : Factor w/ 2 levels "No","Yes": 1 2 2 2 2 2 2 2 2 2 ...
##  $ readmitted              : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
# NOTE(review): "?" is still listed as a race level in the printed output even
# though the "?" rows were filtered out earlier, so that level is now empty.
# model.matrix() further down consequently uses "?" as the baseline (hence the
# raceAfricanAmerican column in X), whereas the glm() summaries show no such
# coefficient. Consider droplevels() after filtering -- confirm intent.
levels(data1$race)
## [1] "?"               "AfricanAmerican" "Asian"           "Caucasian"      
## [5] "Hispanic"        "Other"
# Fix the RNG seed ahead of the cross-validated LASSO fit further down.
set.seed(seed = 34)
colnames(data1)
##  [1] "race"                     "gender"                  
##  [3] "age"                      "admission_type_id"       
##  [5] "discharge_disposition_id" "admission_source_id"     
##  [7] "time_in_hospital"         "num_lab_procedures"      
##  [9] "num_procedures"           "num_medications"         
## [11] "number_outpatient"        "number_emergency"        
## [13] "number_inpatient"         "diag_1"                  
## [15] "diag_2"                   "diag_3"                  
## [17] "number_diagnoses"         "max_glu_serum"           
## [19] "A1Cresult"                "metformin"               
## [21] "repaglinide"              "nateglinide"             
## [23] "glimepiride"              "glipizide"               
## [25] "glyburide"                "pioglitazone"            
## [27] "rosiglitazone"            "insulin"                 
## [29] "change"                   "diabetesMed"             
## [31] "readmitted"
# Response: the binary readmitted factor.
Y <- data1[["readmitted"]]
# Predictors: dummy-coded design matrix over every other column except the
# three diagnosis-code factors (700+ levels each in the str() output), with
# the intercept column removed.
X <- model.matrix(readmitted ~ . - diag_1 - diag_2 - diag_3, data = data1)
X <- X[, colnames(X) != "(Intercept)"]
colnames(X)
##  [1] "raceAfricanAmerican"      "raceAsian"               
##  [3] "raceCaucasian"            "raceHispanic"            
##  [5] "raceOther"                "genderMale"              
##  [7] "genderUnknown/Invalid"    "age[10-20)"              
##  [9] "age[20-30)"               "age[30-40)"              
## [11] "age[40-50)"               "age[50-60)"              
## [13] "age[60-70)"               "age[70-80)"              
## [15] "age[80-90)"               "age[90-100)"             
## [17] "admission_type_id"        "discharge_disposition_id"
## [19] "admission_source_id"      "time_in_hospital"        
## [21] "num_lab_procedures"       "num_procedures"          
## [23] "num_medications"          "number_outpatient"       
## [25] "number_emergency"         "number_inpatient"        
## [27] "number_diagnoses"         "max_glu_serum>300"       
## [29] "max_glu_serumNone"        "max_glu_serumNorm"       
## [31] "A1Cresult>8"              "A1CresultNone"           
## [33] "A1CresultNorm"            "metforminNo"             
## [35] "metforminSteady"          "metforminUp"             
## [37] "repaglinideNo"            "repaglinideSteady"       
## [39] "repaglinideUp"            "nateglinideNo"           
## [41] "nateglinideSteady"        "nateglinideUp"           
## [43] "glimepirideNo"            "glimepirideSteady"       
## [45] "glimepirideUp"            "glipizideNo"             
## [47] "glipizideSteady"          "glipizideUp"             
## [49] "glyburideNo"              "glyburideSteady"         
## [51] "glyburideUp"              "pioglitazoneNo"          
## [53] "pioglitazoneSteady"       "pioglitazoneUp"          
## [55] "rosiglitazoneNo"          "rosiglitazoneSteady"     
## [57] "rosiglitazoneUp"          "insulinNo"               
## [59] "insulinSteady"            "insulinUp"               
## [61] "changeNo"                 "diabetesMedYes"
# 10-fold cross-validated LASSO (alpha = 1) logistic regression.
fit.lasso.cv <- cv.glmnet(
    x = X, y = Y, family = "binomial", alpha = 1, nfolds = 10
)
fit.lasso.cv[["lambda.1se"]]  # 0.008129686
## [1] 0.008129686
fit.lasso.cv[["lambda.min"]]  # 0.0003773466
## [1] 0.0003773466
# fit.lasso.cv$nzero

# plot(fit.lasso.cv$lambda , main = 'There are 100 lambdas used' , xlab =
# 'Lambda Index' , ylab = 'Lambda Value' ) head(data.frame(
# Cross.Validation.Erorr = fit.lasso.cv$cvm , Lambda = fit.lasso.cv$lambda))
# Mean CV error against lambda on the raw scale, then glmnet's own CV plot.
plot(fit.lasso.cv$lambda, fit.lasso.cv$cvm, xlab = expression(lambda), ylab = "mean cv errors")
plot(fit.lasso.cv)
# Coefficients at lambda.min.
coef.min <- coef(fit.lasso.cv, s = "lambda.min")  # s may be 'lambda.1se', 'lambda.min' or a numeric lambda value
coef.min <- coef.min[which(coef.min != 0), ]  # keep only the non-zero coefficients (named numeric vector)
coef.min  # the set of predictors chosen
##              (Intercept)                raceOther               genderMale 
##            -2.8097888694            -0.0554280330             0.0085184650 
##               age[10-20)               age[40-50)               age[50-60) 
##            -0.4299743773            -0.0648807009            -0.1340123827 
##               age[70-80)               age[80-90)              age[90-100) 
##             0.0417078492             0.0392092314            -0.0205490554 
##        admission_type_id discharge_disposition_id      admission_source_id 
##            -0.0175199817             0.0235694307            -0.0060412653 
##         time_in_hospital       num_lab_procedures           num_procedures 
##             0.0149442927             0.0005622261            -0.0207667319 
##          num_medications         number_emergency         number_inpatient 
##             0.0042996983             0.0342383325             0.2589845454 
##         number_diagnoses        max_glu_serumNone            A1CresultNone 
##             0.0420158104            -0.1190377023             0.0868206666 
##            A1CresultNorm          metforminSteady              metforminUp 
##            -0.0022275961            -0.1190003473            -0.2414481452 
##        repaglinideSteady            repaglinideUp        nateglinideSteady 
##             0.0253972517             0.4042658898             0.0216907292 
##            nateglinideUp        glimepirideSteady              glipizideNo 
##            -0.6337575431            -0.1174192521            -0.0196914037 
##              glipizideUp              glyburideNo       pioglitazoneSteady 
##             0.1116018770             0.0254227776            -0.0531719832 
##          rosiglitazoneNo                insulinNo            insulinSteady 
##             0.0482183707            -0.1023942925            -0.1031935972 
##                insulinUp           diabetesMedYes 
##            -0.0399687368             0.1900698122
names(coef.min)  # names only, no estimates -- 38 entries, the first being the intercept
##  [1] "(Intercept)"              "raceOther"               
##  [3] "genderMale"               "age[10-20)"              
##  [5] "age[40-50)"               "age[50-60)"              
##  [7] "age[70-80)"               "age[80-90)"              
##  [9] "age[90-100)"              "admission_type_id"       
## [11] "discharge_disposition_id" "admission_source_id"     
## [13] "time_in_hospital"         "num_lab_procedures"      
## [15] "num_procedures"           "num_medications"         
## [17] "number_emergency"         "number_inpatient"        
## [19] "number_diagnoses"         "max_glu_serumNone"       
## [21] "A1CresultNone"            "A1CresultNorm"           
## [23] "metforminSteady"          "metforminUp"             
## [25] "repaglinideSteady"        "repaglinideUp"           
## [27] "nateglinideSteady"        "nateglinideUp"           
## [29] "glimepirideSteady"        "glipizideNo"             
## [31] "glipizideUp"              "glyburideNo"             
## [33] "pioglitazoneSteady"       "rosiglitazoneNo"         
## [35] "insulinNo"                "insulinSteady"           
## [37] "insulinUp"                "diabetesMedYes"
# using λ=lambda.1se coef.1se <- coef(fit.lasso.cv, s='lambda.1se') coef.1se
# <- coef.1se[which(coef.1se !=0),] coef.1se rownames(as.matrix(coef.1se)) #
# only 4 variables -- too sparse

# using all non-zero coefficients coef.nzero <-coef(fit.lasso.cv, nzero = 3)
# coef.nzero <- coef.nzero[which(coef.nzero !=0), ]
# rownames(as.matrix(coef.nzero))

# Final choice: lambda.min. This repeats the extraction performed earlier and
# prints the same coefficients.
coef.min <- coef(fit.lasso.cv, s = "lambda.min")  # s may be 'lambda.1se', 'lambda.min' or a numeric lambda value
coef.min <- coef.min[which(coef.min != 0), ]  # keep only the non-zero coefficients
coef.min
##              (Intercept)                raceOther               genderMale 
##            -2.8097888694            -0.0554280330             0.0085184650 
##               age[10-20)               age[40-50)               age[50-60) 
##            -0.4299743773            -0.0648807009            -0.1340123827 
##               age[70-80)               age[80-90)              age[90-100) 
##             0.0417078492             0.0392092314            -0.0205490554 
##        admission_type_id discharge_disposition_id      admission_source_id 
##            -0.0175199817             0.0235694307            -0.0060412653 
##         time_in_hospital       num_lab_procedures           num_procedures 
##             0.0149442927             0.0005622261            -0.0207667319 
##          num_medications         number_emergency         number_inpatient 
##             0.0042996983             0.0342383325             0.2589845454 
##         number_diagnoses        max_glu_serumNone            A1CresultNone 
##             0.0420158104            -0.1190377023             0.0868206666 
##            A1CresultNorm          metforminSteady              metforminUp 
##            -0.0022275961            -0.1190003473            -0.2414481452 
##        repaglinideSteady            repaglinideUp        nateglinideSteady 
##             0.0253972517             0.4042658898             0.0216907292 
##            nateglinideUp        glimepirideSteady              glipizideNo 
##            -0.6337575431            -0.1174192521            -0.0196914037 
##              glipizideUp              glyburideNo       pioglitazoneSteady 
##             0.1116018770             0.0254227776            -0.0531719832 
##          rosiglitazoneNo                insulinNo            insulinSteady 
##             0.0482183707            -0.1023942925            -0.1031935972 
##                insulinUp           diabetesMedYes 
##            -0.0399687368             0.1900698122
names(coef.min)
##  [1] "(Intercept)"              "raceOther"               
##  [3] "genderMale"               "age[10-20)"              
##  [5] "age[40-50)"               "age[50-60)"              
##  [7] "age[70-80)"               "age[80-90)"              
##  [9] "age[90-100)"              "admission_type_id"       
## [11] "discharge_disposition_id" "admission_source_id"     
## [13] "time_in_hospital"         "num_lab_procedures"      
## [15] "num_procedures"           "num_medications"         
## [17] "number_emergency"         "number_inpatient"        
## [19] "number_diagnoses"         "max_glu_serumNone"       
## [21] "A1CresultNone"            "A1CresultNorm"           
## [23] "metforminSteady"          "metforminUp"             
## [25] "repaglinideSteady"        "repaglinideUp"           
## [27] "nateglinideSteady"        "nateglinideUp"           
## [29] "glimepirideSteady"        "glipizideNo"             
## [31] "glipizideUp"              "glyburideNo"             
## [33] "pioglitazoneSteady"       "rosiglitazoneNo"         
## [35] "insulinNo"                "insulinSteady"           
## [37] "insulinUp"                "diabetesMedYes"
# Names of the LASSO-selected terms, kept for reference when choosing the
# predictors of the follow-up glm() refit.
var.min <- names(coef.min)
print(var.min)
##  [1] "(Intercept)"              "raceOther"               
##  [3] "genderMale"               "age[10-20)"              
##  [5] "age[40-50)"               "age[50-60)"              
##  [7] "age[70-80)"               "age[80-90)"              
##  [9] "age[90-100)"              "admission_type_id"       
## [11] "discharge_disposition_id" "admission_source_id"     
## [13] "time_in_hospital"         "num_lab_procedures"      
## [15] "num_procedures"           "num_medications"         
## [17] "number_emergency"         "number_inpatient"        
## [19] "number_diagnoses"         "max_glu_serumNone"       
## [21] "A1CresultNone"            "A1CresultNorm"           
## [23] "metforminSteady"          "metforminUp"             
## [25] "repaglinideSteady"        "repaglinideUp"           
## [27] "nateglinideSteady"        "nateglinideUp"           
## [29] "glimepirideSteady"        "glipizideNo"             
## [31] "glipizideUp"              "glyburideNo"             
## [33] "pioglitazoneSteady"       "rosiglitazoneNo"         
## [35] "insulinNo"                "insulinSteady"           
## [37] "insulinUp"                "diabetesMedYes"
# names(data1) 'raceOther' 'genderMale' 'age[10-20)' 'age[40-50)'
# 'age[50-60)' [7] 'age[70-80)' 'age[80-90)' 'age[90-100)'
# 'admission_type_id' 'discharge_disposition_id' 'admission_source_id' [13]
# 'time_in_hospital' 'num_lab_procedures' 'num_procedures' 'num_medications'
# 'number_emergency' 'number_inpatient' [19] 'number_diagnoses'
# 'max_glu_serumNone' 'A1CresultNone' 'A1CresultNorm' 'metforminSteady'
# 'metforminUp' [25] 'repaglinideSteady' 'repaglinideUp' 'nateglinideSteady'
# 'nateglinideUp' 'glimepirideSteady' 'glipizideNo' [31] 'glipizideUp'
# 'glyburideNo' 'pioglitazoneSteady' 'rosiglitazoneNo' 'insulinNo'
# 'insulinSteady' [37] 'insulinUp' 'diabetesMedYes'

# Refit an unpenalised logistic regression on the variables behind the
# LASSO-selected columns. LASSO selects individual dummy columns (e.g.
# "age[10-20)"), which are not columns of data1, so the terms are listed by
# parent variable instead of being pasted together from var.min.
# The formula is built with reformulate() so that glm() receives a real
# formula object rather than a character string.
# NOTE(review): admission_type_id, discharge_disposition_id and
# admission_source_id have non-zero LASSO coefficients in the output above but
# are not included here -- confirm this is intentional.
lm.input <- reformulate(
    c(
        "race", "gender", "age",
        "time_in_hospital", "num_lab_procedures", "num_procedures",
        "num_medications", "number_emergency", "number_inpatient",
        "number_diagnoses", "max_glu_serum", "A1Cresult",
        "metformin", "repaglinide", "nateglinide", "glimepiride",
        "glipizide", "glyburide", "pioglitazone", "rosiglitazone",
        "insulin", "diabetesMed"
    ),
    response = "readmitted"
)

fit.min.lm <- glm(lm.input, data = data1, family = "binomial")
lm.output <- coef(fit.min.lm)  # coefficient estimates of the refit
summary(fit.min.lm)
## 
## Call:
## glm(formula = lm.input, family = "binomial", data = data1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5322  -0.4963  -0.4416  -0.3941   2.8657  
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           -5.3816479  1.4869900  -3.619 0.000296 ***
## raceAsian             -0.0228188  0.1346789  -0.169 0.865457    
## raceCaucasian         -0.0086374  0.0268357  -0.322 0.747557    
## raceHispanic          -0.0241176  0.0773566  -0.312 0.755214    
## raceOther             -0.1040263  0.0915539  -1.136 0.255861    
## genderMale             0.0167169  0.0207789   0.805 0.421100    
## genderUnknown/Invalid -5.9331294 43.9540900  -0.135 0.892624    
## age[10-20)             0.8940673  0.6060015   1.475 0.140117    
## age[20-30)             1.4227249  0.5889612   2.416 0.015707 *  
## age[30-40)             1.4161440  0.5862809   2.415 0.015715 *  
## age[40-50)             1.3444803  0.5850202   2.298 0.021552 *  
## age[50-60)             1.2868988  0.5847136   2.201 0.027743 *  
## age[60-70)             1.4424860  0.5845997   2.467 0.013607 *  
## age[70-80)             1.4967793  0.5845621   2.561 0.010452 *  
## age[80-90)             1.5029480  0.5847638   2.570 0.010165 *  
## age[90-100)            1.4216124  0.5875047   2.420 0.015531 *  
## time_in_hospital       0.0217499  0.0038806   5.605 2.09e-08 ***
## num_lab_procedures     0.0004399  0.0005865   0.750 0.453204    
## num_procedures        -0.0273986  0.0068651  -3.991 6.58e-05 ***
## num_medications        0.0058254  0.0016152   3.607 0.000310 ***
## number_emergency       0.0337583  0.0084927   3.975 7.04e-05 ***
## number_inpatient       0.2596307  0.0065642  39.553  < 2e-16 ***
## number_diagnoses       0.0405081  0.0061306   6.608 3.91e-11 ***
## max_glu_serum>300     -0.0112851  0.1162091  -0.097 0.922639    
## max_glu_serumNone     -0.0720006  0.0815605  -0.883 0.377351    
## max_glu_serumNorm     -0.0386135  0.1018787  -0.379 0.704677    
## A1Cresult>8            0.0256722  0.0674826   0.380 0.703629    
## A1CresultNone          0.1105945  0.0568690   1.945 0.051808 .  
## A1CresultNorm         -0.0059462  0.0736751  -0.081 0.935674    
## metforminNo           -0.1227714  0.1313434  -0.935 0.349924    
## metforminSteady       -0.2573010  0.1329411  -1.935 0.052935 .  
## metforminUp           -0.4079801  0.1727285  -2.362 0.018178 *  
## repaglinideNo          0.7147104  0.6001499   1.191 0.233698    
## repaglinideSteady      0.7468635  0.6053853   1.234 0.217315    
## repaglinideUp          1.1738777  0.6500391   1.806 0.070941 .  
## nateglinideNo          0.2023111  1.0513833   0.192 0.847410    
## nateglinideSteady      0.2466038  1.0582851   0.233 0.815744    
## nateglinideUp         -0.9423263  1.4727213  -0.640 0.522268    
## glimepirideNo         -0.0691762  0.2224389  -0.311 0.755808    
## glimepirideSteady     -0.2289179  0.2274472  -1.006 0.314191    
## glimepirideUp         -0.1140181  0.2859720  -0.399 0.690111    
## glipizideNo           -0.2781310  0.1215778  -2.288 0.022156 *  
## glipizideSteady       -0.2833616  0.1241181  -2.283 0.022430 *  
## glipizideUp           -0.1190406  0.1622031  -0.734 0.463011    
## glyburideNo            0.2195461  0.1481506   1.482 0.138364    
## glyburideSteady        0.1986180  0.1507842   1.317 0.187760    
## glyburideUp            0.1753186  0.1869016   0.938 0.348231    
## pioglitazoneNo        -0.2174042  0.2611622  -0.832 0.405156    
## pioglitazoneSteady    -0.2993717  0.2638911  -1.134 0.256605    
## pioglitazoneUp        -0.1768262  0.3310317  -0.534 0.593226    
## rosiglitazoneNo        0.6892399  0.4635314   1.487 0.137033    
## rosiglitazoneSteady    0.6243854  0.4652818   1.342 0.179612    
## rosiglitazoneUp        0.5300713  0.5271293   1.006 0.314617    
## insulinNo             -0.1153220  0.0385079  -2.995 0.002747 ** 
## insulinSteady         -0.1420041  0.0332700  -4.268 1.97e-05 ***
## insulinUp             -0.0872010  0.0395249  -2.206 0.027368 *  
## diabetesMedYes         0.2047108  0.0381742   5.363 8.21e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 69886  on 99492  degrees of freedom
## Residual deviance: 67362  on 99436  degrees of freedom
## AIC: 67476
## 
## Number of Fisher Scoring iterations: 7
# Per-term likelihood-ratio tests for the refit; the p-values in this table
# drive the manual backward elimination that follows.
Anova(fit.min.lm)
## Analysis of Deviance Table (Type II tests)
## 
## Response: readmitted
##                    LR Chisq Df Pr(>Chisq)    
## race                   1.38  4  0.8475447    
## gender                 0.79  2  0.6747996    
## age                   73.29  9  3.440e-12 ***
## time_in_hospital      30.99  1  2.594e-08 ***
## num_lab_procedures     0.56  1  0.4530848    
## num_procedures        16.14  1  5.898e-05 ***
## num_medications       12.90  1  0.0003284 ***
## number_emergency      15.62  1  7.742e-05 ***
## number_inpatient    1476.97  1  < 2.2e-16 ***
## number_diagnoses      44.30  1  2.814e-11 ***
## max_glu_serum          1.43  3  0.6989452    
## A1Cresult             11.71  3  0.0084518 ** 
## metformin             27.06  3  5.719e-06 ***
## repaglinide            4.94  3  0.1759255    
## nateglinide            1.92  3  0.5893757    
## glimepiride            9.64  3  0.0218451 *  
## glipizide              6.97  3  0.0729761 .  
## glyburide              2.61  3  0.4563960    
## pioglitazone           4.57  3  0.2059665    
## rosiglitazone          5.15  3  0.1614085    
## insulin               18.21  3  0.0003986 ***
## diabetesMed           28.79  1  8.052e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

A more robust and methodologically sound model used LASSO regularization, a model-building approach that adds a constraint (a penalty) on the coefficients and thereby yields a sparse model. This is useful given the number of features (many with multiple levels) that we have. The tuning parameter for this penalty function - \(\lambda\) - was chosen via cross validation. This yielded a model with 22 variables. Fitting a logistic model with these 22 variables and running the Anova() test revealed that not all variables were statistically significant. Thus, I began the process of (manual) backwards elimination by removing, at each step, the variable with the largest p-value (i.e., the variable for which the data give the least evidence against the null hypothesis that its coefficient is zero).

# Drop race first -- it has the highest p-value (0.85 in the preceding Anova
# table), so the null hypothesis of no effect cannot be rejected.
fit.min.lm.1 <- update(fit.min.lm, . ~ . - race)
Anova(fit.min.lm.1)
## Analysis of Deviance Table (Type II tests)
## 
## Response: readmitted
##                    LR Chisq Df Pr(>Chisq)    
## gender                 0.75  2  0.6856314    
## age                   73.92  9  2.579e-12 ***
## time_in_hospital      31.29  1  2.223e-08 ***
## num_lab_procedures     0.56  1  0.4547930    
## num_procedures        16.38  1  5.182e-05 ***
## num_medications       13.11  1  0.0002939 ***
## number_emergency      15.56  1  8.013e-05 ***
## number_inpatient    1479.70  1  < 2.2e-16 ***
## number_diagnoses      44.43  1  2.634e-11 ***
## max_glu_serum          1.38  3  0.7106120    
## A1Cresult             11.80  3  0.0081164 ** 
## metformin             27.14  3  5.494e-06 ***
## repaglinide            4.91  3  0.1783463    
## nateglinide            1.92  3  0.5899898    
## glimepiride            9.72  3  0.0211358 *  
## glipizide              6.91  3  0.0749654 .  
## glyburide              2.61  3  0.4555461    
## pioglitazone           4.59  3  0.2047850    
## rosiglitazone          5.12  3  0.1634236    
## insulin               17.92  3  0.0004566 ***
## diabetesMed           28.77  1  8.160e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Drop gender next (p = 0.686, not significant).
# NOTE(review): max_glu_serum has a slightly larger p-value (0.711) in the
# table above, so a strict largest-p-first ordering would remove it first.
fit.min.lm.2 <- update(fit.min.lm.1, formula. = . ~ . - gender)
Anova(fit.min.lm.2)  # re-test the remaining terms
## Analysis of Deviance Table (Type II tests)
## 
## Response: readmitted
##                    LR Chisq Df Pr(>Chisq)    
## age                   73.52  9  3.099e-12 ***
## time_in_hospital      31.14  1  2.397e-08 ***
## num_lab_procedures     0.57  1  0.4492066    
## num_procedures        16.06  1  6.122e-05 ***
## num_medications       12.85  1  0.0003377 ***
## number_emergency      15.44  1  8.504e-05 ***
## number_inpatient    1479.65  1  < 2.2e-16 ***
## number_diagnoses      44.62  1  2.389e-11 ***
## max_glu_serum          1.39  3  0.7070448    
## A1Cresult             11.74  3  0.0083415 ** 
## metformin             27.21  3  5.309e-06 ***
## repaglinide            4.91  3  0.1783611    
## nateglinide            1.92  3  0.5890378    
## glimepiride            9.66  3  0.0216930 *  
## glipizide              6.92  3  0.0744379 .  
## glyburide              2.56  3  0.4645470    
## pioglitazone           4.61  3  0.2028793    
## rosiglitazone          5.08  3  0.1662045    
## insulin               17.98  3  0.0004434 ***
## diabetesMed           28.77  1  8.169e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Drop max_glu_serum: largest remaining p-value (0.707).
fit.min.lm.3 <- update(fit.min.lm.2, formula. = . ~ . - max_glu_serum)
Anova(fit.min.lm.3)  # re-test the remaining terms
## Analysis of Deviance Table (Type II tests)
## 
## Response: readmitted
##                    LR Chisq Df Pr(>Chisq)    
## age                   74.32  9  2.149e-12 ***
## time_in_hospital      32.21  1  1.386e-08 ***
## num_lab_procedures     0.32  1  0.5728589    
## num_procedures        16.98  1  3.785e-05 ***
## num_medications       13.29  1  0.0002669 ***
## number_emergency      15.67  1  7.531e-05 ***
## number_inpatient    1480.99  1  < 2.2e-16 ***
## number_diagnoses      43.83  1  3.578e-11 ***
## A1Cresult             11.93  3  0.0076332 ** 
## metformin             27.61  3  4.386e-06 ***
## repaglinide            4.88  3  0.1805475    
## nateglinide            1.91  3  0.5902633    
## glimepiride            9.89  3  0.0195480 *  
## glipizide              6.96  3  0.0733318 .  
## glyburide              2.53  3  0.4693058    
## pioglitazone           4.69  3  0.1963129    
## rosiglitazone          5.08  3  0.1663287    
## insulin               18.40  3  0.0003644 ***
## diabetesMed           28.66  1  8.645e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Drop nateglinide: largest remaining p-value (0.5902633).
fit.min.lm.4 <- update(fit.min.lm.3, formula. = . ~ . - nateglinide)
Anova(fit.min.lm.4)  # re-test the remaining terms
## Analysis of Deviance Table (Type II tests)
## 
## Response: readmitted
##                    LR Chisq Df Pr(>Chisq)    
## age                   74.44  9  2.037e-12 ***
## time_in_hospital      32.20  1  1.388e-08 ***
## num_lab_procedures     0.31  1  0.5755722    
## num_procedures        17.00  1  3.743e-05 ***
## num_medications       13.28  1  0.0002676 ***
## number_emergency      15.55  1  8.051e-05 ***
## number_inpatient    1481.76  1  < 2.2e-16 ***
## number_diagnoses      43.80  1  3.641e-11 ***
## A1Cresult             11.97  3  0.0074881 ** 
## metformin             27.68  3  4.248e-06 ***
## repaglinide            4.89  3  0.1803641    
## glimepiride            9.88  3  0.0196204 *  
## glipizide              6.97  3  0.0729185 .  
## glyburide              2.54  3  0.4685628    
## pioglitazone           4.69  3  0.1962762    
## rosiglitazone          5.07  3  0.1664375    
## insulin               18.40  3  0.0003645 ***
## diabetesMed           28.81  1  7.971e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Drop num_lab_procedures: largest remaining p-value (0.5755722).
fit.min.lm.5 <- update(fit.min.lm.4, formula. = . ~ . - num_lab_procedures)
Anova(fit.min.lm.5)  # re-test the remaining terms
## Analysis of Deviance Table (Type II tests)
## 
## Response: readmitted
##                  LR Chisq Df Pr(>Chisq)    
## age                 74.48  9  2.005e-12 ***
## time_in_hospital    34.75  1  3.740e-09 ***
## num_procedures      17.23  1  3.309e-05 ***
## num_medications     14.31  1  0.0001550 ***
## number_emergency    15.47  1  8.363e-05 ***
## number_inpatient  1482.76  1  < 2.2e-16 ***
## number_diagnoses    44.50  1  2.545e-11 ***
## A1Cresult           11.73  3  0.0083653 ** 
## metformin           28.04  3  3.568e-06 ***
## repaglinide          4.89  3  0.1799361    
## glimepiride          9.92  3  0.0192825 *  
## glipizide            7.01  3  0.0714712 .  
## glyburide            2.52  3  0.4717495    
## pioglitazone         4.74  3  0.1914788    
## rosiglitazone        5.10  3  0.1647842    
## insulin             18.40  3  0.0003631 ***
## diabetesMed         28.71  1  8.391e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Drop glyburide: largest remaining p-value (0.4717495).
fit.min.lm.6 <- update(fit.min.lm.5, formula. = . ~ . - glyburide)
Anova(fit.min.lm.6)  # re-test the remaining terms
## Analysis of Deviance Table (Type II tests)
## 
## Response: readmitted
##                  LR Chisq Df Pr(>Chisq)    
## age                 73.68  9  2.878e-12 ***
## time_in_hospital    34.23  1  4.904e-09 ***
## num_procedures      17.15  1  3.447e-05 ***
## num_medications     14.00  1  0.0001826 ***
## number_emergency    15.50  1  8.268e-05 ***
## number_inpatient  1486.00  1  < 2.2e-16 ***
## number_diagnoses    44.93  1  2.040e-11 ***
## A1Cresult           11.89  3  0.0077834 ** 
## metformin           28.04  3  3.571e-06 ***
## repaglinide          4.94  3  0.1762407    
## glimepiride          9.33  3  0.0251810 *  
## glipizide            7.39  3  0.0604617 .  
## pioglitazone         4.61  3  0.2023333    
## rosiglitazone        5.10  3  0.1649434    
## insulin             19.40  3  0.0002259 ***
## diabetesMed         28.87  1  7.729e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Drop pioglitazone: largest remaining p-value (0.2023333).
fit.min.lm.7 <- update(fit.min.lm.6, formula. = . ~ . - pioglitazone)
Anova(fit.min.lm.7)  # re-test the remaining terms
## Analysis of Deviance Table (Type II tests)
## 
## Response: readmitted
##                  LR Chisq Df Pr(>Chisq)    
## age                 73.76  9  2.772e-12 ***
## time_in_hospital    35.07  1  3.186e-09 ***
## num_procedures      17.01  1  3.710e-05 ***
## num_medications     13.29  1  0.0002667 ***
## number_emergency    15.39  1  8.741e-05 ***
## number_inpatient  1491.28  1  < 2.2e-16 ***
## number_diagnoses    45.11  1  1.867e-11 ***
## A1Cresult           11.80  3  0.0081094 ** 
## metformin           27.84  3  3.930e-06 ***
## repaglinide          4.99  3  0.1723571    
## glimepiride          9.44  3  0.0239910 *  
## glipizide            7.46  3  0.0585190 .  
## rosiglitazone        4.55  3  0.2078855    
## insulin             19.98  3  0.0001714 ***
## diabetesMed         26.39  1  2.784e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Drop rosiglitazone: largest remaining p-value (0.2078855).
fit.min.lm.8 <- update(fit.min.lm.7, formula. = . ~ . - rosiglitazone)
Anova(fit.min.lm.8)  # re-test the remaining terms
## Analysis of Deviance Table (Type II tests)
## 
## Response: readmitted
##                  LR Chisq Df Pr(>Chisq)    
## age                 73.85  9  2.666e-12 ***
## time_in_hospital    34.95  1  3.378e-09 ***
## num_procedures      17.00  1  3.736e-05 ***
## num_medications     12.96  1  0.0003186 ***
## number_emergency    15.48  1  8.333e-05 ***
## number_inpatient  1492.95  1  < 2.2e-16 ***
## number_diagnoses    45.37  1  1.635e-11 ***
## A1Cresult           11.92  3  0.0076604 ** 
## metformin           28.29  3  3.164e-06 ***
## repaglinide          4.97  3  0.1737818    
## glimepiride          9.49  3  0.0234029 *  
## glipizide            7.37  3  0.0609423 .  
## insulin             20.55  3  0.0001308 ***
## diabetesMed         24.87  1  6.138e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Drop repaglinide: largest remaining p-value (0.1737818).
fit.min.lm.9 <- update(fit.min.lm.8, formula. = . ~ . - repaglinide)
Anova(fit.min.lm.9)  # re-test the remaining terms
## Analysis of Deviance Table (Type II tests)
## 
## Response: readmitted
##                  LR Chisq Df Pr(>Chisq)    
## age                 74.25  9  2.224e-12 ***
## time_in_hospital    35.17  1  3.025e-09 ***
## num_procedures      16.99  1  3.760e-05 ***
## num_medications     12.98  1  0.0003155 ***
## number_emergency    15.55  1  8.030e-05 ***
## number_inpatient  1493.12  1  < 2.2e-16 ***
## number_diagnoses    45.54  1  1.495e-11 ***
## A1Cresult           11.65  3  0.0086970 ** 
## metformin           28.27  3  3.183e-06 ***
## glimepiride          9.65  3  0.0218146 *  
## glipizide            7.34  3  0.0618024 .  
## insulin             20.73  3  0.0001197 ***
## diabetesMed         25.60  1  4.201e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Drop glipizide: largest remaining p-value (0.0618024), significant only at
# the .1 level.
fit.min.lm.10 <- update(fit.min.lm.9, formula. = . ~ . - glipizide)
Anova(fit.min.lm.10)  # re-test the remaining terms
## Analysis of Deviance Table (Type II tests)
## 
## Response: readmitted
##                  LR Chisq Df Pr(>Chisq)    
## age                 75.00  9  1.583e-12 ***
## time_in_hospital    36.65  1  1.416e-09 ***
## num_procedures      17.20  1  3.363e-05 ***
## num_medications     13.45  1  0.0002449 ***
## number_emergency    15.48  1  8.317e-05 ***
## number_inpatient  1492.62  1  < 2.2e-16 ***
## number_diagnoses    45.23  1  1.750e-11 ***
## A1Cresult           11.35  3  0.0099759 ** 
## metformin           28.52  3  2.819e-06 ***
## glimepiride         10.41  3  0.0153940 *  
## insulin             20.20  3  0.0001546 ***
## diabetesMed         30.21  1  3.871e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Drop glimepiride: largest remaining p-value (0.0153940, '*'). It is
# significant at the .05 level but not at .01 -- presumably .01 is the cut-off
# being applied here (TODO confirm).
#
# Background on the drug: Glimepiride is a prescription drug. It comes as an
# oral tablet. Glimepiride is available as the brand-name drug Amaryl and as a
# generic drug. Generic drugs usually cost less. In some cases, they may not
# be available in every strength or form as the brand-name version. This drug
# may be used as part of a combination therapy. That means you need to take it
# with other drugs. Glimepiride is used to reduce high blood sugar levels in
# people with type 2 diabetes. It’s used in combination with a healthy diet
# and exercise. This medication may be used with insulin or other types of
# diabetes drugs to help control your high blood sugar.

fit.min.lm.11 <- update(fit.min.lm.10, formula. = . ~ . - glimepiride)
Anova(fit.min.lm.11)  # re-test the remaining terms
## Analysis of Deviance Table (Type II tests)
## 
## Response: readmitted
##                  LR Chisq Df Pr(>Chisq)    
## age                 73.97  9  2.520e-12 ***
## time_in_hospital    37.27  1  1.029e-09 ***
## num_procedures      17.26  1  3.267e-05 ***
## num_medications     12.93  1  0.0003241 ***
## number_emergency    15.19  1  9.709e-05 ***
## number_inpatient  1494.42  1  < 2.2e-16 ***
## number_diagnoses    45.05  1  1.923e-11 ***
## A1Cresult           11.52  3  0.0092159 ** 
## metformin           28.33  3  3.099e-06 ***
## insulin             21.26  3  9.309e-05 ***
## diabetesMed         26.33  1  2.873e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

After manual backward elimination the model has 11 variables, all of which are statistically significant at the .01 level. Some of the remaining numeric variables look as though they might exhibit collinearity. However, the correlation matrix and the pairwise plots below show that the correlations are modest: the largest, between time_in_hospital and num_medications, is about 0.47.

# Pairwise correlations (cor() defaults) between the numeric predictors that
# remain in the model.
data1 %>%
  select(
    time_in_hospital, num_procedures, num_medications,
    number_emergency, number_inpatient, number_diagnoses
  ) %>%
  cor()
##                  time_in_hospital num_procedures num_medications
## time_in_hospital      1.000000000     0.19323405      0.46638083
## num_procedures        0.193234051     1.00000000      0.38553831
## num_medications       0.466380832     0.38553831      1.00000000
## number_emergency     -0.009798542    -0.03836918      0.01296355
## number_inpatient      0.073408368    -0.06584306      0.06499285
## number_diagnoses      0.220686659     0.07233875      0.25860468
##                  number_emergency number_inpatient number_diagnoses
## time_in_hospital     -0.009798542       0.07340837       0.22068666
## num_procedures       -0.038369184      -0.06584306       0.07233875
## num_medications       0.012963548       0.06499285       0.25860468
## number_emergency      1.000000000       0.26638244       0.05408781
## number_inpatient      0.266382440       1.00000000       0.10325182
## number_diagnoses      0.054087810       0.10325182       1.00000000
# <<<<< CORRELATION OF VARIABLES >>>>>>>>>
# Scatterplot matrix of the numeric predictors that remain in the model.
# The six columns are selected by name and are all numeric (cor() accepts
# them just above), so the former select_if(is.numeric) pre-filter -- a
# superseded dplyr verb -- was redundant and has been dropped.
data1 %>%
  select(
    time_in_hospital, num_procedures, num_medications,
    number_emergency, number_inpatient, number_diagnoses
  ) %>%
  ggpairs()

# Coefficient estimates and fit statistics for the 11-variable model left
# after backward elimination.
summary(fit.min.lm.11)
## 
## Call:
## glm(formula = readmitted ~ age + time_in_hospital + num_procedures + 
##     num_medications + number_emergency + number_inpatient + number_diagnoses + 
##     A1Cresult + metformin + insulin + diabetesMed, family = "binomial", 
##     data = data1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5358  -0.4956  -0.4424  -0.3955   2.8691  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -4.157626   0.602610  -6.899 5.22e-12 ***
## age[10-20)        0.896795   0.605905   1.480 0.138849    
## age[20-30)        1.421427   0.588815   2.414 0.015776 *  
## age[30-40)        1.413644   0.586114   2.412 0.015870 *  
## age[40-50)        1.339672   0.584841   2.291 0.021983 *  
## age[50-60)        1.279925   0.584549   2.190 0.028554 *  
## age[60-70)        1.434274   0.584443   2.454 0.014124 *  
## age[70-80)        1.489393   0.584408   2.549 0.010817 *  
## age[80-90)        1.495828   0.584605   2.559 0.010507 *  
## age[90-100)       1.413535   0.587332   2.407 0.016097 *  
## time_in_hospital  0.023171   0.003769   6.148 7.84e-10 ***
## num_procedures   -0.028107   0.006812  -4.126 3.69e-05 ***
## num_medications   0.005694   0.001578   3.609 0.000307 ***
## number_emergency  0.033182   0.008459   3.923 8.75e-05 ***
## number_inpatient  0.260669   0.006549  39.803  < 2e-16 ***
## number_diagnoses  0.040492   0.006078   6.662 2.71e-11 ***
## A1Cresult>8       0.028387   0.067394   0.421 0.673605    
## A1CresultNone     0.108175   0.056418   1.917 0.055190 .  
## A1CresultNorm    -0.004481   0.073608  -0.061 0.951463    
## metforminNo      -0.109046   0.130935  -0.833 0.404945    
## metforminSteady  -0.246776   0.132530  -1.862 0.062598 .  
## metforminUp      -0.399892   0.172410  -2.319 0.020372 *  
## insulinNo        -0.138316   0.036019  -3.840 0.000123 ***
## insulinSteady    -0.146603   0.033091  -4.430 9.41e-06 ***
## insulinUp        -0.088388   0.039496  -2.238 0.025226 *  
## diabetesMedYes    0.170255   0.033175   5.132 2.87e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 69886  on 99492  degrees of freedom
## Residual deviance: 67403  on 99467  degrees of freedom
## AIC: 67455
## 
## Number of Fisher Scoring iterations: 6

Looking at the summary of the model, multiple levels of A1Cresult are not individually significant. On further investigation, A1Cresult also seems likely to be highly correlated with insulin. The case for metformin is similar. For those reasons I’ve decided to remove both variables.

# Background on A1C: “The more glucose that enters the bloodstream, the
# higher the amount of glycated hemoglobin,” Dr. Dodell says. An A1C level
# below 5.7 percent is considered normal. An A1C between 5.7 and 6.4 percent
# signals prediabetes. Type 2 diabetes is diagnosed when the A1C is over 6.5
# percent.

# Drop A1Cresult from the model.
fit.min.lm.12 <- update(fit.min.lm.11, formula. = . ~ . - A1Cresult)
Anova(fit.min.lm.12)  # re-test the remaining terms
## Analysis of Deviance Table (Type II tests)
## 
## Response: readmitted
##                  LR Chisq Df Pr(>Chisq)    
## age                 80.41  9  1.342e-13 ***
## time_in_hospital    34.81  1  3.630e-09 ***
## num_procedures      16.76  1  4.252e-05 ***
## num_medications     13.16  1  0.0002853 ***
## number_emergency    15.28  1  9.262e-05 ***
## number_inpatient  1525.03  1  < 2.2e-16 ***
## number_diagnoses    43.98  1  3.323e-11 ***
## metformin           29.49  3  1.771e-06 ***
## insulin             20.33  3  0.0001452 ***
## diabetesMed         26.04  1  3.350e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Drop metformin as well; like A1Cresult it was judged likely to be highly
# correlated with insulin.
fit.min.lm.13 <- update(fit.min.lm.12, formula. = . ~ . - metformin)
Anova(fit.min.lm.13)  # re-test the remaining terms
## Analysis of Deviance Table (Type II tests)
## 
## Response: readmitted
##                  LR Chisq Df Pr(>Chisq)    
## age                 85.86  9  1.100e-14 ***
## time_in_hospital    35.17  1  3.021e-09 ***
## num_procedures      14.01  1  0.0001818 ***
## num_medications     10.61  1  0.0011252 ** 
## number_emergency    14.87  1  0.0001151 ***
## number_inpatient  1561.98  1  < 2.2e-16 ***
## number_diagnoses    48.03  1  4.203e-12 ***
## insulin             26.98  3  5.947e-06 ***
## diabetesMed         13.82  1  0.0002009 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
2.0.0.3.1 Final model specified by preliminary model building

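$$ \operatorname{logit}\left(P(\text{readmitted})\right) = -4.151 + \beta_{\text{age}} + 0.022 \cdot \text{time\_in\_hospital} - 0.025 \cdot \text{num\_procedures} + 0.005 \cdot \text{num\_medications} + 0.033 \cdot \text{number\_emergency} + 0.265 \cdot \text{number\_inpatient} + 0.042 \cdot \text{number\_diagnoses} + \beta_{\text{insulin}} + 0.117 \cdot \text{diabetesMedYes} $$

Here the left-hand side is the log-odds of readmission (the model is a binomial GLM), and \(\beta_{\text{age}}\) and \(\beta_{\text{insulin}}\) stand for the level-specific coefficients of the categorical variables age and insulin reported in the summary below (zero for the reference level).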

# Coefficient estimates and fit statistics for the final model.
summary(fit.min.lm.13)
## 
## Call:
## glm(formula = readmitted ~ age + time_in_hospital + num_procedures + 
##     num_medications + number_emergency + number_inpatient + number_diagnoses + 
##     insulin + diabetesMed, family = "binomial", data = data1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5785  -0.4944  -0.4433  -0.3981   2.8601  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -4.151458   0.584642  -7.101 1.24e-12 ***
## age[10-20)        0.889354   0.605864   1.468 0.142128    
## age[20-30)        1.424425   0.588665   2.420 0.015531 *  
## age[30-40)        1.409278   0.585919   2.405 0.016162 *  
## age[40-50)        1.332041   0.584615   2.278 0.022697 *  
## age[50-60)        1.276257   0.584303   2.184 0.028945 *  
## age[60-70)        1.437881   0.584173   2.461 0.013840 *  
## age[70-80)        1.498978   0.584121   2.566 0.010282 *  
## age[80-90)        1.512456   0.584315   2.588 0.009642 ** 
## age[90-100)       1.436819   0.587038   2.448 0.014382 *  
## time_in_hospital  0.022443   0.003758   5.971 2.35e-09 ***
## num_procedures   -0.025263   0.006791  -3.720 0.000199 ***
## num_medications   0.005140   0.001572   3.269 0.001079 ** 
## number_emergency  0.032809   0.008436   3.889 0.000101 ***
## number_inpatient  0.265180   0.006509  40.741  < 2e-16 ***
## number_diagnoses  0.041705   0.006065   6.876 6.14e-12 ***
## insulinNo        -0.168857   0.035189  -4.799 1.60e-06 ***
## insulinSteady    -0.151091   0.033016  -4.576 4.73e-06 ***
## insulinUp        -0.092093   0.039477  -2.333 0.019657 *  
## diabetesMedYes    0.116898   0.031464   3.715 0.000203 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 69886  on 99492  degrees of freedom
## Residual deviance: 67444  on 99473  degrees of freedom
## AIC: 67484
## 
## Number of Fisher Scoring iterations: 6
# formula = readmitted ~ age + time_in_hospital + num_procedures +
# num_medications + number_emergency + number_inpatient + number_diagnoses +
# insulin + diabetesMed

3 Appendix